Source code for data_juicer
-__version__ = '1.0.0'
+__version__ = '1.0.1'
import os
import subprocess
@@ -123,12 +123,18 @@ Source code for data_juicer
_CUDA_DEVICE_COUNT = _cuda_device_count()
-
diff --git a/_modules/data_juicer/analysis/column_wise_analysis.html b/_modules/data_juicer/analysis/column_wise_analysis.html
index 06c048f1f..caa40b8d6 100644
--- a/_modules/data_juicer/analysis/column_wise_analysis.html
+++ b/_modules/data_juicer/analysis/column_wise_analysis.html
@@ -1,18 +1,18 @@
- data_juicer.analysis.column_wise_analysis — data_juicer 1.0.0 documentation
+ data_juicer.analysis.column_wise_analysis — data_juicer 1.0.1 documentation
@@ -130,10 +130,14 @@ Source code for data_juicer.analysis.column_wise_analysis
return int(now_row), int(now_col), grids
-[docs]class ColumnWiseAnalysis:
+
+[docs]
+class ColumnWiseAnalysis:
"""Apply analysis on each column of stats respectively."""
-[docs] def __init__(self,
+
+[docs]
+ def __init__(self,
dataset,
output_path,
overall_result=None,
@@ -160,7 +164,10 @@ Source code for data_juicer.analysis.column_wise_analysis
self.save_stats_in_one_file = save_stats_in_one_file
-[docs] def analyze(self, show_percentiles=False, show=False, skip_export=False):
+
+
+[docs]
+ def analyze(self, show_percentiles=False, show=False, skip_export=False):
"""
Apply analysis and draw the analysis figure for stats.
@@ -268,7 +275,10 @@ Source code for data_juicer.analysis.column_wise_analysis
# TODO: (fixme) the saved png sometime are blank
plt.clf()
-[docs] def draw_hist(self, ax, data, save_path, percentiles=None, show=False):
+
+
+[docs]
+ def draw_hist(self, ax, data, save_path, percentiles=None, show=False):
"""
Draw the histogram for the data.
@@ -329,7 +339,10 @@ Source code for data_juicer.analysis.column_wise_analysis
# add a little rotation on labels of x axis to avoid overlapping
ax.tick_params(axis='x', rotation=25)
-[docs] def draw_box(self, ax, data, save_path, percentiles=None, show=False):
+
+
+[docs]
+ def draw_box(self, ax, data, save_path, percentiles=None, show=False):
"""
Draw the box plot for the data.
@@ -375,7 +388,9 @@ Source code for data_juicer.analysis.column_wise_analysis
# if no showing, we need to clear this axes to avoid
# accumulated overlapped figures in different draw_xxx function
# calling
- ax.clear()
+ ax.clear()
+
+
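
The hunks above only reflow the [docs] anchors around ColumnWiseAnalysis; the class API itself is unchanged. For orientation, a minimal usage sketch, assuming stats_ds is a stats-bearing dataset and overall is the table returned by OverallAnalysis.analyze() (both hypothetical here, not taken from this diff):

    from data_juicer.analysis.column_wise_analysis import ColumnWiseAnalysis

    # stats_ds and overall are assumed inputs
    cwa = ColumnWiseAnalysis(stats_ds,
                             output_path='./analysis',
                             overall_result=overall,
                             save_stats_in_one_file=True)
    cwa.analyze(show_percentiles=True, skip_export=False)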
diff --git a/_modules/data_juicer/analysis/diversity_analysis.html b/_modules/data_juicer/analysis/diversity_analysis.html
index a6ff5fe2b..fa2276840 100644
--- a/_modules/data_juicer/analysis/diversity_analysis.html
+++ b/_modules/data_juicer/analysis/diversity_analysis.html
@@ -1,18 +1,18 @@
- data_juicer.analysis.diversity_analysis — data_juicer 1.0.0 documentation
+ data_juicer.analysis.diversity_analysis — data_juicer 1.0.1 documentation
@@ -161,11 +161,15 @@ Source code for data_juicer.analysis.diversity_analysis
return df
-[docs]class DiversityAnalysis:
+
+[docs]
+class DiversityAnalysis:
"""Apply diversity analysis for each sample and get an overall analysis
result."""
-[docs] def __init__(self, dataset, output_path, lang_or_model='en'):
+
+[docs]
+ def __init__(self, dataset, output_path, lang_or_model='en'):
"""Initialization method :param dataset: the dataset to be analyzed
:param output_path: path to store the analysis results :param
lang_or_model: the diversity model or a specific language used to load
@@ -177,7 +181,10 @@ Source code for data_juicer.analysis.diversity_analysis
os.makedirs(self.output_path)
self.lang_or_model = lang_or_model
-[docs] def compute(self, lang_or_model=None, column_name='text'):
+
+
+[docs]
+ def compute(self, lang_or_model=None, column_name='text'):
"""
Apply lexical tree analysis on each sample.
@@ -208,7 +215,10 @@ Source code for data_juicer.analysis.diversity_analysis
dataset = self.dataset.map(find_verb_noun)
return pd.DataFrame(dataset)
-[docs] def analyze(self,
+
+
+[docs]
+ def analyze(self,
lang_or_model=None,
column_name='text',
postproc_func=get_diversity,
@@ -234,7 +244,9 @@ Source code for data_juicer.analysis.diversity_analysis
df.to_csv(os.path.join(self.output_path, 'diversity.csv'))
df.to_markdown(os.path.join(self.output_path, 'diversity.md'))
- return df
+ return df
+
+
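
The diversity_analysis hunks are likewise anchor reflows only. A minimal sketch of the API shown in the listing, assuming ds is a dataset with a 'text' column:

    from data_juicer.analysis.diversity_analysis import DiversityAnalysis

    div = DiversityAnalysis(ds, output_path='./analysis', lang_or_model='en')
    df = div.analyze(column_name='text')  # also writes diversity.csv / diversity.md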
diff --git a/_modules/data_juicer/analysis/overall_analysis.html b/_modules/data_juicer/analysis/overall_analysis.html
index 8680711df..e8962830a 100644
--- a/_modules/data_juicer/analysis/overall_analysis.html
+++ b/_modules/data_juicer/analysis/overall_analysis.html
@@ -1,18 +1,18 @@
- data_juicer.analysis.overall_analysis — data_juicer 1.0.0 documentation
+ data_juicer.analysis.overall_analysis — data_juicer 1.0.1 documentation
@@ -92,11 +92,15 @@ Source code for data_juicer.analysis.overall_analysis
return col_overall
-[docs]class OverallAnalysis:
+
+[docs]
+class OverallAnalysis:
"""Apply analysis on the overall stats, including mean, std, quantiles,
etc."""
-[docs] def __init__(self, dataset, output_path):
+
+[docs]
+ def __init__(self, dataset, output_path):
"""
Initialization method.
@@ -117,7 +121,10 @@ Source code for data_juicer.analysis.overall_analysis
# {numbers, string, list of one of before}
self.supported_object_types = {str, list}
-[docs] def refine_single_column(self, col):
+
+
+[docs]
+ def refine_single_column(self, col):
if col.dtype != 'object':
# not an object, return directly
return col
@@ -137,7 +144,10 @@ Source code for data_juicer.analysis.overall_analysis
col = col.explode().infer_objects()
return col
-[docs] def analyze(self, percentiles=[], num_proc=1, skip_export=False):
+
+
+[docs]
+ def analyze(self, percentiles=[], num_proc=1, skip_export=False):
"""
Apply overall analysis on the whole dataset based on the describe
method of pandas.
@@ -171,7 +181,9 @@ Source code for data_juicer.analysis.overall_analysis
overall.to_csv(os.path.join(self.output_path, 'overall.csv'))
overall.to_markdown(os.path.join(self.output_path, 'overall.md'))
- return overall
+ return overall
+
+
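
A matching sketch for OverallAnalysis, again assuming stats_ds is a dataset that already carries the computed stats:

    from data_juicer.analysis.overall_analysis import OverallAnalysis

    oa = OverallAnalysis(stats_ds, output_path='./analysis')
    overall = oa.analyze(percentiles=[0.25, 0.5, 0.75], num_proc=4)
    # the same table is exported to overall.csv and overall.md unless
    # skip_export=True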
diff --git a/_modules/data_juicer/config/config.html b/_modules/data_juicer/config/config.html
index 85efc839c..76ef5041a 100644
--- a/_modules/data_juicer/config/config.html
+++ b/_modules/data_juicer/config/config.html
@@ -1,18 +1,18 @@
- data_juicer.config.config — data_juicer 1.0.0 documentation
+ data_juicer.config.config — data_juicer 1.0.1 documentation
@@ -102,7 +102,9 @@ Source code for data_juicer.config.config
global_parser = None
-[docs]def init_configs(args: Optional[List[str]] = None):
+
+[docs]
+def init_configs(args: Optional[List[str]] = None):
"""
initialize the jsonargparse parser and parse configs from one of:
1. POSIX-style commands line args;
@@ -309,6 +311,12 @@ Source code for data_juicer.config.config
help='The compression method of the cache file, which can be'
'specified in ["gzip", "zstd", "lz4"]. If this parameter is'
'None, the cache file will not be compressed.')
+ parser.add_argument(
+ '--open_monitor',
+ type=bool,
+ default=True,
+ help='Whether to open the monitor to trace resource utilization for '
+ 'each OP during data processing. It\'s True by default.')
parser.add_argument(
'--use_checkpoint',
type=bool,
@@ -433,6 +441,7 @@ Source code for data_juicer.config.config
logger.error('Config initialization failed')
+
def update_ds_cache_dir_and_related_vars(new_ds_cache_path):
from pathlib import Path
@@ -759,7 +768,9 @@ Source code for data_juicer.config.config
print(table)
-[docs]def export_config(cfg: Namespace,
+
+[docs]
+def export_config(cfg: Namespace,
path: str,
format: str = 'yaml',
skip_none: bool = True,
@@ -801,7 +812,10 @@ Source code for data_juicer.config.config
logger.info(f'Saved the configuration in {path}')
-[docs]def merge_config(ori_cfg: Namespace, new_cfg: Namespace):
+
+
+[docs]
+def merge_config(ori_cfg: Namespace, new_cfg: Namespace):
"""
Merge configuration from new_cfg into ori_cfg
@@ -859,7 +873,10 @@ Source code for data_juicer.config.config
logger.error('Config merge failed')
-[docs]def prepare_side_configs(ori_config: Union[str, Namespace, Dict]):
+
+
+[docs]
+def prepare_side_configs(ori_config: Union[str, Namespace, Dict]):
"""
parse the config if ori_config is a string of a config file path with
yaml, yml or json format
@@ -891,7 +908,10 @@ Source code for data_juicer.config.config
return config
-[docs]def get_init_configs(cfg: Union[Namespace, Dict]):
+
+
+[docs]
+def get_init_configs(cfg: Union[Namespace, Dict]):
"""
set init configs of data_juicer for cfg
"""
@@ -904,6 +924,7 @@ Source code for data_juicer.config.config
json.dump(cfg, f)
inited_dj_cfg = init_configs(['--config', temp_file])
return inited_dj_cfg
+
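
The substantive change in config.config is the new --open_monitor boolean option. A minimal sketch of toggling it through the argument list that init_configs already accepts; the config file path is hypothetical:

    from data_juicer.config.config import init_configs

    cfg = init_configs(['--config', 'configs/demo.yaml',   # hypothetical path
                        '--open_monitor', 'False'])
    print(cfg.open_monitor)  # False: per-OP resource monitoring is skipped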
diff --git a/_modules/data_juicer/core/adapter.html b/_modules/data_juicer/core/adapter.html
index 1ca13cc18..eee2749b2 100644
--- a/_modules/data_juicer/core/adapter.html
+++ b/_modules/data_juicer/core/adapter.html
@@ -1,18 +1,18 @@
- data_juicer.core.adapter — data_juicer 1.0.0 documentation
+ data_juicer.core.adapter — data_juicer 1.0.1 documentation
@@ -85,15 +85,22 @@ Source code for data_juicer.core.adapter
from data_juicer.utils.process_utils import setup_mp
-[docs]class Adapter:
+
+[docs]
+class Adapter:
MAX_BATCH_SIZE = 10000
-[docs] def __init__(self, cfg: dict):
+
+[docs]
+ def __init__(self, cfg: dict):
self.cfg = cfg
self.idle_resources = Monitor.monitor_current_resources()
-[docs] @staticmethod
+
+
+[docs]
+ @staticmethod
def execute_and_probe(dataset, operators, sample_interval=0.5):
"""
Process the input dataset and probe related information for each OP in
@@ -149,7 +156,10 @@ Source code for data_juicer.core.adapter
return resource_util_list
-[docs] @staticmethod
+
+
+[docs]
+ @staticmethod
def take_batch(dataset, config):
"""
Split the dataset into batches based on configuration and load factor.
@@ -170,7 +180,10 @@ Source code for data_juicer.core.adapter
else:
return dataset.take(batch_size)
-[docs] def adapt_workloads(self, dataset, operators):
+
+
+[docs]
+ def adapt_workloads(self, dataset, operators):
"""
Manage the scheduling and load balancing for the dataset processing.
@@ -187,7 +200,10 @@ Source code for data_juicer.core.adapter
return bs_per_op
-[docs] def probe_small_batch(self, dataset, operators):
+
+
+[docs]
+ def probe_small_batch(self, dataset, operators):
"""
Perform small batch pre-execution to probe available resources,
current load and estimated OP speed, returning load factors and speed
@@ -220,7 +236,10 @@ Source code for data_juicer.core.adapter
return analysis_res, len(data_batch)
-[docs] def batch_size_strategy(self, load_analysis_res, base_bs=1, util_th=0.9):
+
+
+[docs]
+ def batch_size_strategy(self, load_analysis_res, base_bs=1, util_th=0.9):
"""
Decide the batch size for each op according to their workload analysis
result and expected utilization threshold. We need to guarantee that
@@ -255,7 +274,9 @@ Source code for data_juicer.core.adapter
self.MAX_BATCH_SIZE)
batch_size_per_op.append(bs_this_op)
- return batch_size_per_op
+ return batch_size_per_op
+
+
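
The Adapter hunks are again anchor reflows only. A hedged sketch of the workflow its docstrings describe, assuming cfg, ds, and ops come from an existing pipeline setup:

    from data_juicer.core.adapter import Adapter

    adapter = Adapter(cfg)                        # records idle resources at init
    bs_per_op = adapter.adapt_workloads(ds, ops)  # probes a small batch, then
                                                  # returns a batch size per OP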
diff --git a/_modules/data_juicer/core/analyzer.html b/_modules/data_juicer/core/analyzer.html
index 727834f6d..de93e44a0 100644
--- a/_modules/data_juicer/core/analyzer.html
+++ b/_modules/data_juicer/core/analyzer.html
@@ -1,18 +1,18 @@
- data_juicer.core.analyzer — data_juicer 1.0.0 documentation
+ data_juicer.core.analyzer — data_juicer 1.0.1 documentation
@@ -95,7 +95,9 @@ Source code for data_juicer.core.analyzer
from .exporter import Exporter
-[docs]class Analyzer:
+
+[docs]
+class Analyzer:
"""
This Analyzer class is used to analyze a specific dataset.
@@ -106,7 +108,9 @@ Source code for data_juicer.core.analyzer
dataset better.
"""
-[docs] def __init__(self, cfg: Optional[Namespace] = None):
+
+[docs]
+ def __init__(self, cfg: Optional[Namespace] = None):
"""
Initialization method.
@@ -149,7 +153,10 @@ Source code for data_juicer.core.analyzer
self.overall_single_plot_path = None
self.analysis_path = os.path.join(self.cfg.work_dir, 'analysis')
-[docs] def run(self,
+
+
+[docs]
+ def run(self,
load_data_np: Optional[PositiveInt] = None,
skip_export: bool = False,
skip_return: bool = False):
@@ -229,7 +236,9 @@ Source code for data_juicer.core.analyzer
column_wise_analysis.analyze(skip_export=skip_export)
if not skip_return:
- return dataset
+ return dataset
+
+
diff --git a/_modules/data_juicer/core/data.html b/_modules/data_juicer/core/data.html
index c58c269b9..175f6034d 100644
--- a/_modules/data_juicer/core/data.html
+++ b/_modules/data_juicer/core/data.html
@@ -1,18 +1,18 @@
- data_juicer.core.data — data_juicer 1.0.0 documentation
+ data_juicer.core.data — data_juicer 1.0.1 documentation
@@ -220,10 +220,14 @@ Source code for data_juicer.core.data
return super().map(**args)
-[docs]class NestedDataset(Dataset, DJDataset):
+
+[docs]
+class NestedDataset(Dataset, DJDataset):
"""Enhanced HuggingFace-Dataset for better usability and efficiency."""
-[docs] def __init__(self, *args, **kargs):
+
+[docs]
+ def __init__(self, *args, **kargs):
if len(args) == 1 and isinstance(args[0], Dataset):
# init from another Dataset instance
self.__dict__ = copy.copy(args[0].__dict__)
@@ -233,6 +237,7 @@ Source code for data_juicer.core.data
self.need_to_cleanup_caches = not is_caching_enabled()
+
def __getitem__(self, key):
if isinstance(key, str):
# to index columns by query as string name(s)
@@ -243,13 +248,18 @@ Source code for data_juicer.core.data
res = super().__getitem__(key)
return nested_obj_factory(res)
-[docs] def process(self,
- operators,
- *,
- work_dir=None,
- exporter=None,
- checkpointer=None,
- tracer=None):
+
+[docs]
+ def process(
+ self,
+ operators,
+ *,
+ work_dir=None,
+ exporter=None,
+ checkpointer=None,
+ tracer=None,
+ open_monitor=True,
+ ):
if operators is None:
return self
@@ -258,7 +268,8 @@ Source code for data_juicer.core.data
unforkable_operators = set(UNFORKABLE.modules.keys())
# resource utilization monitor
- resource_util_list = []
+ if open_monitor:
+ resource_util_list = []
dataset = self
try:
@@ -275,12 +286,16 @@ Source code for data_juicer.core.data
'exporter': exporter,
'tracer': tracer,
}
- dataset, resource_util_per_op = Monitor.monitor_func(
- op.run, args=run_args)
+ if open_monitor:
+ dataset, resource_util_per_op = Monitor.monitor_func(
+ op.run, args=run_args)
+ else:
+ dataset = op.run(**run_args)
# record processed ops
if checkpointer is not None:
checkpointer.record(op._op_cfg)
- resource_util_list.append(resource_util_per_op)
+ if open_monitor:
+ resource_util_list.append(resource_util_per_op)
end = time()
logger.info(f'OP [{op._name}] Done in {end - start:.3f}s. '
f'Left {len(dataset)} samples.')
@@ -294,7 +309,10 @@ Source code for data_juicer.core.data
'last op...')
dataset.cleanup_cache_files()
checkpointer.save_ckpt(dataset)
- if work_dir:
+ if work_dir and open_monitor:
+ # get the analyzed version
+ resource_util_list = Monitor.analyze_resource_util_list(
+ resource_util_list)
monitor_dir = os.path.join(work_dir, 'monitor')
os.makedirs(monitor_dir, exist_ok=True)
with open(os.path.join(monitor_dir, 'monitor.json'),
@@ -304,9 +322,10 @@ Source code for data_juicer.core.data
monitor_dir)
return dataset
-[docs] def map(self, *args, **kargs):
- """Override the map func, which is called by most common operations,
- such that the processed samples can be accessed by nested manner."""
+
+
+[docs]
+ def update_args(self, args, kargs, is_filter=False):
if args:
args = list(args)
# the first positional para is function
@@ -332,15 +351,17 @@ Source code for data_juicer.core.data
# batched is required for fault-tolerant or batched OP
if callable(getattr(
called_func.__self__,
- 'is_batched_op')) and called_func.__self__.is_batched_op(
- ) or not getattr(called_func.__self__, 'turbo', False):
+ 'is_batched_op')) and called_func.__self__.is_batched_op():
kargs['batched'] = True
kargs['batch_size'] = kargs.pop('batch_size', 1)
+ elif not getattr(called_func.__self__, 'turbo', False):
+ kargs['batched'] = True
+ kargs['batch_size'] = 1
else:
kargs['batched'] = False
- # rank is required for cuda model loading
- if callable(
+ # rank is required for cuda model loading for map
+ if not is_filter and callable(
getattr(called_func.__self__,
'use_cuda')) and called_func.__self__.use_cuda():
kargs['with_rank'] = True
@@ -349,6 +370,17 @@ Source code for data_juicer.core.data
new_fingerprint = generate_fingerprint(self, *args, **kargs)
kargs['new_fingerprint'] = new_fingerprint
+ return args, kargs
+
+
+
+[docs]
+ def map(self, *args, **kargs):
+ """Override the map func, which is called by most common operations,
+ such that the processed samples can be accessed by nested manner."""
+
+ args, kargs = self.update_args(args, kargs)
+
if cache_utils.CACHE_COMPRESS:
decompress(self, kargs['new_fingerprint'],
kargs['num_proc'] if 'num_proc' in kargs else 1)
@@ -364,41 +396,13 @@ Source code for data_juicer.core.data
return new_ds
-[docs] def filter(self, *args, **kargs):
+
+
+[docs]
+ def filter(self, *args, **kargs):
"""Override the filter func, which is called by most common operations,
such that the processed samples can be accessed by nested manner."""
- if args:
- args = list(args)
- # the first positional para is function
- if args[0] is None:
- args[0] = lambda x: nested_obj_factory(x)
- else:
- args[0] = wrap_func_with_nested_access(args[0])
- called_func = args[0]
- else:
- if 'function' not in kargs or kargs['function'] is None:
- kargs['function'] = lambda x: nested_obj_factory(x)
- else:
- kargs['function'] = wrap_func_with_nested_access(
- kargs['function'])
- called_func = kargs['function']
-
- # For wrapped function, try to get its unwrapped (bound) method
- while not inspect.ismethod(called_func) and hasattr(
- called_func, '__wrapped__'):
- called_func = called_func.__wrapped__
-
- # Batched is always required for fault tolerance
- if inspect.ismethod(called_func):
- if callable(getattr(
- called_func.__self__,
- 'is_batched_op')) and called_func.__self__.is_batched_op():
- kargs['batched'] = True
- kargs['batch_size'] = kargs.pop('batch_size', 1)
-
- if 'new_fingerprint' not in kargs or kargs['new_fingerprint'] is None:
- new_fingerprint = generate_fingerprint(self, *args, **kargs)
- kargs['new_fingerprint'] = new_fingerprint
+ args, kargs = self.update_args(args, kargs, is_filter=True)
# For filter, it involves a map and a filter operations, so the final
# cache files includes two sets with different fingerprint (before and
@@ -428,42 +432,65 @@ Source code for data_juicer.core.data
return new_ds
-[docs] def select(self, *args, **kargs):
+
+
+[docs]
+ def select(self, *args, **kargs):
"""Override the select func, such that selected samples can be accessed
by nested manner."""
return nested_obj_factory(super().select(*args, **kargs))
-[docs] @classmethod
+
+
+[docs]
+ @classmethod
def from_dict(cls, *args, **kargs):
"""Override the from_dict func, which is called by most from_xx
constructors, such that the constructed dataset object is
NestedDataset."""
return NestedDataset(super().from_dict(*args, **kargs))
-[docs] def add_column(self, *args, **kargs):
+
+
+[docs]
+ def add_column(self, *args, **kargs):
"""Override the add column func, such that the processed samples
can be accessed by nested manner."""
return NestedDataset(super().add_column(*args, **kargs))
-[docs] def select_columns(self, *args, **kargs):
+
+
+[docs]
+ def select_columns(self, *args, **kargs):
"""Override the select columns func, such that the processed samples
can be accessed by nested manner."""
return NestedDataset(super().select_columns(*args, **kargs))
-[docs] def remove_columns(self, *args, **kargs):
+
+
+[docs]
+ def remove_columns(self, *args, **kargs):
"""Override the remove columns func, such that the processed samples
can be accessed by nested manner."""
return NestedDataset(super().remove_columns(*args, **kargs))
-[docs] def cleanup_cache_files(self):
+
+
+[docs]
+ def cleanup_cache_files(self):
"""Override the cleanup_cache_files func, clear raw and compressed
cache files."""
cleanup_compressed_cache_files(self)
return super().cleanup_cache_files()
-[docs] @staticmethod
+
+
+[docs]
+ @staticmethod
def load_from_disk(*args, **kargs):
- return NestedDataset(Dataset.load_from_disk(*args, **kargs))
+ return NestedDataset(Dataset.load_from_disk(*args, **kargs))
+
+
def nested_query(root_obj: Union[NestedDatasetDict, NestedDataset,
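
The core.data changes add an open_monitor switch to NestedDataset.process and factor the shared argument handling of map/filter into update_args. A minimal sketch of the new parameter, assuming ops is a list of configured OP instances:

    from data_juicer.core.data import NestedDataset

    ds = NestedDataset.from_dict({'text': ['hello', 'world']})
    # with open_monitor=False the Monitor.monitor_func wrapping of each
    # op.run call is skipped and no monitor/monitor.json is written
    ds = ds.process(ops, work_dir='./outputs', open_monitor=False)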
diff --git a/_modules/data_juicer/core/executor.html b/_modules/data_juicer/core/executor.html
index 60d797f54..e495e8e78 100644
--- a/_modules/data_juicer/core/executor.html
+++ b/_modules/data_juicer/core/executor.html
@@ -1,18 +1,18 @@
- data_juicer.core.executor — data_juicer 1.0.0 documentation
+ data_juicer.core.executor — data_juicer 1.0.1 documentation
@@ -103,7 +103,9 @@ Source code for data_juicer.core.executor
from .tracer import Tracer
-[docs]class Executor:
+
+[docs]
+class Executor:
"""
This Executor class is used to process a specific dataset.
@@ -111,7 +113,9 @@ Source code for data_juicer.core.executor
ops in the config file in order and generate a processed dataset.
"""
-[docs] def __init__(self, cfg: Optional[Namespace] = None):
+
+[docs]
+ def __init__(self, cfg: Optional[Namespace] = None):
"""
Initialization method.
@@ -175,7 +179,10 @@ Source code for data_juicer.core.executor
logger.info('Trace for all ops.')
self.op_list_to_trace = set(OPERATORS.modules.keys())
-[docs] def sample_data(self,
+
+
+[docs]
+ def sample_data(self,
dataset_to_sample: Dataset = None,
load_data_np=None,
sample_ratio: float = 1.0,
@@ -221,7 +228,10 @@ Source code for data_juicer.core.executor
else:
raise ValueError(f'Unsupported sample_algo: {sample_algo}')
-[docs] def run(self,
+
+
+[docs]
+ def run(self,
load_data_np: Optional[PositiveInt] = None,
skip_return=False):
"""
@@ -272,11 +282,14 @@ Source code for data_juicer.core.executor
# - If checkpoint is open, clean the cache files after each process
logger.info('Processing data...')
tstart = time()
- dataset = dataset.process(ops,
- work_dir=self.work_dir,
- exporter=self.exporter,
- checkpointer=self.ckpt_manager,
- tracer=self.tracer)
+ dataset = dataset.process(
+ ops,
+ work_dir=self.work_dir,
+ exporter=self.exporter,
+ checkpointer=self.ckpt_manager,
+ tracer=self.tracer,
+ open_monitor=self.cfg.open_monitor,
+ )
tend = time()
logger.info(f'All OPs are done in {tend - tstart:.3f}s.')
@@ -289,7 +302,9 @@ Source code for data_juicer.core.executor
compress(dataset)
if not skip_return:
- return dataset
+ return dataset
+
+
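
The executor now forwards cfg.open_monitor into dataset.process. A short sketch of the end-to-end call, with a hypothetical config path:

    from data_juicer.config.config import init_configs
    from data_juicer.core.executor import Executor

    cfg = init_configs(['--config', 'configs/demo.yaml'])  # hypothetical path
    processed = Executor(cfg).run()  # cfg.open_monitor controls per-OP monitoring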
diff --git a/_modules/data_juicer/core/exporter.html b/_modules/data_juicer/core/exporter.html
index 0936f6c97..6d7b5fc06 100644
--- a/_modules/data_juicer/core/exporter.html
+++ b/_modules/data_juicer/core/exporter.html
@@ -1,18 +1,18 @@
- data_juicer.core.exporter — data_juicer 1.0.0 documentation
+ data_juicer.core.exporter — data_juicer 1.0.1 documentation
@@ -85,7 +85,9 @@ Source code for data_juicer.core.exporter
from data_juicer.utils.constant import Fields, HashKeys
-[docs]class Exporter:
+
+[docs]
+class Exporter:
"""The Exporter class is used to export a dataset to files of specific
format."""
@@ -94,7 +96,9 @@ Source code for data_juicer.core.exporter
GiB = 2**30 # 1024*1024*1024
TiB = 2**40 # 1024*1024*1024*1024
-[docs] def __init__(self,
+
+[docs]
+ def __init__(self,
export_path,
export_shard_size=0,
export_in_parallel=True,
@@ -157,6 +161,7 @@ Source code for data_juicer.core.exporter
f'single shard file and make loading and exporting '
f'slower.')
+
def _get_suffix(self, export_path):
"""
Get the suffix of export path and check if it's supported.
@@ -267,7 +272,9 @@ Source code for data_juicer.core.exporter
pool.close()
pool.join()
-[docs] def export(self, dataset):
+
+[docs]
+ def export(self, dataset):
"""
Export method for a dataset.
@@ -277,7 +284,10 @@ Source code for data_juicer.core.exporter
self._export_impl(dataset, self.export_path, self.suffix,
self.export_stats)
-[docs] def export_compute_stats(self, dataset, export_path):
+
+
+[docs]
+ def export_compute_stats(self, dataset, export_path):
"""
Export method for saving compute status in filters
"""
@@ -289,7 +299,10 @@ Source code for data_juicer.core.exporter
export_stats=False)
self.keep_stats_in_res_ds = keep_stats_in_res_ds
-[docs] @staticmethod
+
+
+[docs]
+ @staticmethod
def to_jsonl(dataset, export_path, num_proc=1, **kwargs):
"""
Export method for jsonl target files.
@@ -302,7 +315,10 @@ Source code for data_juicer.core.exporter
"""
dataset.to_json(export_path, force_ascii=False, num_proc=num_proc)
-[docs] @staticmethod
+
+
+[docs]
+ @staticmethod
def to_json(dataset, export_path, num_proc=1, **kwargs):
"""
Export method for json target files.
@@ -318,7 +334,10 @@ Source code for data_juicer.core.exporter
num_proc=num_proc,
lines=False)
-[docs] @staticmethod
+
+
+[docs]
+ @staticmethod
def to_parquet(dataset, export_path, **kwargs):
"""
Export method for parquet target files.
@@ -330,6 +349,7 @@ Source code for data_juicer.core.exporter
"""
dataset.to_parquet(export_path)
+
# suffix to export method
@staticmethod
def _router():
@@ -343,6 +363,7 @@ Source code for data_juicer.core.exporter
'json': Exporter.to_json,
'parquet': Exporter.to_parquet,
}
+
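
The Exporter hunks only reflow anchors. For reference, a minimal sketch of the static jsonl export shown above, assuming ds is a processed dataset:

    from data_juicer.core.exporter import Exporter

    Exporter.to_jsonl(ds, export_path='./outputs/result.jsonl', num_proc=2)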
diff --git a/_modules/data_juicer/core/monitor.html b/_modules/data_juicer/core/monitor.html
index 95abcb3c3..9736354cd 100644
--- a/_modules/data_juicer/core/monitor.html
+++ b/_modules/data_juicer/core/monitor.html
@@ -1,18 +1,18 @@
- data_juicer.core.monitor — data_juicer 1.0.0 documentation
+ data_juicer.core.monitor — data_juicer 1.0.1 documentation
@@ -99,7 +99,9 @@ Source code for data_juicer.core.monitor
mdict['resource'] = this_states
-[docs]class Monitor:
+
+[docs]
+class Monitor:
"""
Monitor resource utilization and other information during the data
processing.
@@ -157,17 +159,25 @@ Source code for data_juicer.core.monitor
'GPU util.',
}
-[docs] def __init__(self):
+
-[docs] def monitor_all_resources(self):
+
+
+[docs]
+ def monitor_all_resources(self):
"""
Detect the resource utilization of all distributed nodes.
"""
# TODO
raise NotImplementedError
-[docs] @staticmethod
+
+
+[docs]
+ @staticmethod
def monitor_current_resources():
"""
Detect the resource utilization of the current environment/machine.
@@ -200,7 +210,10 @@ Source code for data_juicer.core.monitor
return resource_dict
-[docs] @staticmethod
+
+
+[docs]
+ @staticmethod
def draw_resource_util_graph(resource_util_list, store_dir):
import matplotlib.pyplot as plt
for idx, resource_util_dict in enumerate(resource_util_list):
@@ -218,7 +231,10 @@ Source code for data_juicer.core.monitor
plt.savefig(os.path.join(store_dir, fn), bbox_inches='tight')
plt.clf()
-[docs] @staticmethod
+
+
+[docs]
+ @staticmethod
def analyze_resource_util_list(resource_util_list):
"""
Analyze the resource utilization for a given resource util list.
@@ -229,7 +245,10 @@ Source code for data_juicer.core.monitor
res_list.append(Monitor.analyze_single_resource_util(item))
return res_list
-[docs] @staticmethod
+
+
+[docs]
+ @staticmethod
def analyze_single_resource_util(resource_util_dict):
"""
Analyze the resource utilization for a single resource util dict.
@@ -258,7 +277,10 @@ Source code for data_juicer.core.monitor
return resource_util_dict
-[docs] @staticmethod
+
+
+[docs]
+ @staticmethod
def monitor_func(func, args=None, sample_interval=0.5):
"""
Process the input dataset and probe related information for each OP in
@@ -284,7 +306,10 @@ Source code for data_juicer.core.monitor
resource_util_dict = {}
# start monitor
- ctx = get_context('fork')
+ start_method = 'fork'
+ if os.name == 'nt': # for Windows
+ start_method = 'spawn'
+ ctx = get_context(start_method)
with ctx.Manager() as manager:
mdict = manager.dict()
mdict['stop'] = False
@@ -315,7 +340,9 @@ Source code for data_juicer.core.monitor
# calculate speed
resource_util_dict['time'] = end - start
- return ret, resource_util_dict
+ return ret, resource_util_dict
+
+
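
Besides the anchor reflow, monitor_func now falls back to the 'spawn' start method on Windows instead of always using 'fork'. A hedged sketch of monitoring a standalone callable, assuming args is interpreted as a kwargs dict in the same way as the op.run(**run_args) call in core.data:

    from data_juicer.core.monitor import Monitor

    def square_sum(n):
        return sum(i * i for i in range(n))

    ret, util = Monitor.monitor_func(square_sum,
                                     args={'n': 1_000_000},
                                     sample_interval=0.5)
    print(util['time'])  # wall-clock time recorded alongside the resource samples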
diff --git a/_modules/data_juicer/core/tracer.html b/_modules/data_juicer/core/tracer.html
index b2cede0c6..ca242ffc9 100644
--- a/_modules/data_juicer/core/tracer.html
+++ b/_modules/data_juicer/core/tracer.html
@@ -1,18 +1,18 @@
- data_juicer.core.tracer — data_juicer 1.0.0 documentation
+ data_juicer.core.tracer — data_juicer 1.0.1 documentation
@@ -84,7 +84,9 @@ Source code for data_juicer.core.tracer
from loguru import logger
-[docs]class Tracer:
+
+[docs]
+class Tracer:
"""
The tracer to trace the sample changes before and after an operator
process.
@@ -92,7 +94,9 @@ Source code for data_juicer.core.tracer
The comparison results will be stored in the work directory.
"""
-[docs] def __init__(self, work_dir, show_num=10):
+
+[docs]
+ def __init__(self, work_dir, show_num=10):
"""
Initialization method.
@@ -106,7 +110,10 @@ Source code for data_juicer.core.tracer
os.makedirs(self.work_dir)
self.show_num = show_num
-[docs] def trace_mapper(self, op_name: str, previous_ds: Dataset,
+
+
+[docs]
+ def trace_mapper(self, op_name: str, previous_ds: Dataset,
processed_ds: Dataset, text_key: str):
"""
Compare datasets before and after a Mapper.
@@ -156,7 +163,10 @@ Source code for data_juicer.core.tracer
lines=True,
force_ascii=False)
-[docs] def trace_batch_mapper(self, op_name: str, previous_ds: Dataset,
+
+
+[docs]
+ def trace_batch_mapper(self, op_name: str, previous_ds: Dataset,
processed_ds: Dataset, text_key: str):
"""
Compare datasets before and after a BatchMapper.
@@ -196,7 +206,10 @@ Source code for data_juicer.core.tracer
lines=True,
force_ascii=False)
-[docs] def trace_filter(self, op_name: str, previous_ds: Dataset,
+
+
+[docs]
+ def trace_filter(self, op_name: str, previous_ds: Dataset,
processed_ds: Dataset):
"""
Compare datasets before and after a Filter.
@@ -256,7 +269,10 @@ Source code for data_juicer.core.tracer
lines=True,
force_ascii=False)
-[docs] def trace_deduplicator(self, op_name: str, dup_pairs: list):
+
+
+[docs]
+ def trace_deduplicator(self, op_name: str, dup_pairs: list):
"""
Compare datasets before and after a Deduplicator.
@@ -300,7 +316,9 @@ Source code for data_juicer.core.tracer
dup_df.to_json(os.path.join(self.work_dir, res_name),
orient='records',
lines=True,
- force_ascii=False)
+ force_ascii=False)
+
+
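
The Tracer hunks are anchor reflows only. A minimal sketch of the filter-tracing call listed above; the OP name and the before/after datasets are placeholders:

    from data_juicer.core.tracer import Tracer

    tracer = Tracer(work_dir='./outputs', show_num=5)
    # compares the two datasets and, per the to_json calls in the listing,
    # writes the comparison as jsonl under work_dir
    tracer.trace_filter('some_filter_op', before_ds, after_ds)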
diff --git a/_modules/data_juicer/format/csv_formatter.html b/_modules/data_juicer/format/csv_formatter.html
index 8c74645d7..18059e900 100644
--- a/_modules/data_juicer/format/csv_formatter.html
+++ b/_modules/data_juicer/format/csv_formatter.html
@@ -1,18 +1,18 @@
- data_juicer.format.csv_formatter — data_juicer 1.0.0 documentation
+ data_juicer.format.csv_formatter — data_juicer 1.0.1 documentation
@@ -80,7 +80,9 @@ Source code for data_juicer.format.csv_formatter
from .formatter import FORMATTERS, LocalFormatter
-[docs]@FORMATTERS.register_module()
+
+[docs]
+@FORMATTERS.register_module()
class CsvFormatter(LocalFormatter):
"""
The class is used to load and format csv-type files.
@@ -89,7 +91,9 @@ Source code for data_juicer.format.csv_formatter
"""
SUFFIXES = ['.csv']
-[docs] def __init__(self, dataset_path, suffixes=None, **kwargs):
+
+[docs]
+ def __init__(self, dataset_path, suffixes=None, **kwargs):
"""
Initialization method.
@@ -102,7 +106,9 @@ Source code for data_juicer.format.csv_formatter
suffixes=suffixes if suffixes else self.SUFFIXES,
type='csv',
**kwargs,
- )
+ )
+
+
diff --git a/_modules/data_juicer/format/empty_formatter.html b/_modules/data_juicer/format/empty_formatter.html
index 4affd7e1b..235ceabdf 100644
--- a/_modules/data_juicer/format/empty_formatter.html
+++ b/_modules/data_juicer/format/empty_formatter.html
@@ -1,18 +1,18 @@
- data_juicer.format.empty_formatter — data_juicer 1.0.0 documentation
+ data_juicer.format.empty_formatter — data_juicer 1.0.1 documentation
@@ -89,14 +89,18 @@ Source code for data_juicer.format.empty_formatter
ray = LazyLoader('ray', 'ray')
-[docs]@FORMATTERS.register_module()
+
+[docs]
+@FORMATTERS.register_module()
class EmptyFormatter(BaseFormatter):
"""
The class is used to create empty data.
"""
SUFFIXES = []
-[docs] def __init__(self, length, feature_keys: List[str] = [], *args, **kwargs):
+
+[docs]
+ def __init__(self, length, feature_keys: List[str] = [], *args, **kwargs):
"""
Initialization method.
@@ -108,11 +112,14 @@ Source code for data_juicer.format.empty_formatter
if isinstance(self.feature_keys, str):
self.feature_keys = [self.feature_keys]
+
@property
def null_value(self):
return None
-[docs] def load_dataset(self, *args, **kwargs):
+
+[docs]
+ def load_dataset(self, *args, **kwargs):
data_dict = {}
features = Features()
@@ -126,17 +133,23 @@ Source code for data_juicer.format.empty_formatter
from data_juicer.core.data import NestedDataset
empty_dataset = NestedDataset(empty_dataset)
- return empty_dataset
+ return empty_dataset
+
+
-[docs]@FORMATTERS.register_module()
+
+[docs]
+@FORMATTERS.register_module()
class RayEmptyFormatter(BaseFormatter):
"""
The class is used to create empty data for ray.
"""
SUFFIXES = []
-[docs] def __init__(self, length, feature_keys: List[str] = [], *args, **kwargs):
+
+[docs]
+ def __init__(self, length, feature_keys: List[str] = [], *args, **kwargs):
"""
Initialization method.
@@ -148,11 +161,14 @@ Source code for data_juicer.format.empty_formatter
if isinstance(self.feature_keys, str):
self.feature_keys = [self.feature_keys]
+
@property
def null_value(self):
return {}
-[docs] def load_dataset(self, *args, **kwargs):
+
+[docs]
+ def load_dataset(self, *args, **kwargs):
if len(self.feature_keys):
df = pd.DataFrame({
col: [self.null_value for _ in range(self.length)]
@@ -163,7 +179,9 @@ Source code for data_juicer.format.empty_formatter
empty_dataset = ray.data.from_pandas(df)
- return empty_dataset
+ return empty_dataset
+
+
diff --git a/_modules/data_juicer/format/formatter.html b/_modules/data_juicer/format/formatter.html
index 0ba0848c1..5ac386117 100644
--- a/_modules/data_juicer/format/formatter.html
+++ b/_modules/data_juicer/format/formatter.html
@@ -1,18 +1,18 @@
- data_juicer.format.formatter — data_juicer 1.0.0 documentation
+ data_juicer.format.formatter — data_juicer 1.0.1 documentation
@@ -98,11 +98,15 @@ Source code for data_juicer.format.formatter
raise NotImplementedError
-[docs]class LocalFormatter(BaseFormatter):
+
+[docs]
+class LocalFormatter(BaseFormatter):
"""The class is used to load a dataset from local files or local
directory."""
-[docs] def __init__(
+
+[docs]
+ def __init__(
self,
dataset_path: str,
type: str,
@@ -130,7 +134,10 @@ Source code for data_juicer.format.formatter
self.data_files = find_files_with_suffix(dataset_path, suffixes)
self.add_suffix = add_suffix
-[docs] def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset:
+
+
+[docs]
+ def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset:
"""
Load a dataset from dataset file or dataset directory, and unify its
format.
@@ -157,14 +164,20 @@ Source code for data_juicer.format.formatter
text_keys=self.text_keys,
num_proc=num_proc,
global_cfg=global_cfg)
- return ds
+ return ds
+
+
-[docs]class RemoteFormatter(BaseFormatter):
+
+[docs]
+class RemoteFormatter(BaseFormatter):
"""The class is used to load a dataset from repository of huggingface
hub."""
-[docs] def __init__(self,
+
+[docs]
+ def __init__(self,
dataset_path: str,
text_keys: List[str] = None,
**kwargs):
@@ -180,7 +193,10 @@ Source code for data_juicer.format.formatter
self.text_keys = text_keys
self.kwargs = kwargs
-[docs] def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset:
+
+
+[docs]
+ def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset:
"""
Load a dataset from HuggingFace, and unify its format.
@@ -196,7 +212,9 @@ Source code for data_juicer.format.formatter
text_keys=self.text_keys,
num_proc=num_proc,
global_cfg=global_cfg)
- return ds
+ return ds
+
+
def add_suffixes(datasets: DatasetDict, num_proc: int = 1) -> Dataset:
diff --git a/_modules/data_juicer/format/json_formatter.html b/_modules/data_juicer/format/json_formatter.html
index 9300a3e39..98dc91960 100644
--- a/_modules/data_juicer/format/json_formatter.html
+++ b/_modules/data_juicer/format/json_formatter.html
@@ -1,18 +1,18 @@
- data_juicer.format.json_formatter — data_juicer 1.0.0 documentation
+ data_juicer.format.json_formatter — data_juicer 1.0.1 documentation
@@ -80,7 +80,9 @@ Source code for data_juicer.format.json_formatter
from .formatter import FORMATTERS, LocalFormatter
-[docs]@FORMATTERS.register_module()
+
+[docs]
+@FORMATTERS.register_module()
class JsonFormatter(LocalFormatter):
"""
The class is used to load and format json-type files.
@@ -89,7 +91,9 @@ Source code for data_juicer.format.json_formatter
"""
SUFFIXES = ['.json', '.jsonl', '.jsonl.zst']
-[docs] def __init__(self, dataset_path, suffixes=None, **kwargs):
+
+[docs]
+ def __init__(self, dataset_path, suffixes=None, **kwargs):
"""
Initialization method.
@@ -102,7 +106,9 @@ Source code for data_juicer.format.json_formatter
suffixes=suffixes if suffixes else self.SUFFIXES,
type='json',
**kwargs,
- )
+ )
+
+
diff --git a/_modules/data_juicer/format/load.html b/_modules/data_juicer/format/load.html
index 4c392474b..f947568be 100644
--- a/_modules/data_juicer/format/load.html
+++ b/_modules/data_juicer/format/load.html
@@ -1,18 +1,18 @@
- data_juicer.format.load — data_juicer 1.0.0 documentation
+ data_juicer.format.load — data_juicer 1.0.1 documentation
@@ -81,7 +81,9 @@ Source code for data_juicer.format.load
from .mixture_formatter import MixtureFormatter
-[docs]def load_formatter(dataset_path,
+
+[docs]
+def load_formatter(dataset_path,
generated_dataset_config=None,
text_keys=None,
suffixes=[],
@@ -118,6 +120,7 @@ Source code for data_juicer.format.load
add_suffix=add_suffix,
**kwargs)
return formatter
+
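
A minimal sketch of the load_formatter entry point whose signature appears above; the data path and keys are illustrative:

    from data_juicer.format.load import load_formatter

    formatter = load_formatter('demo-data/',
                               text_keys=['text'],
                               suffixes=['.jsonl'])
    ds = formatter.load_dataset(num_proc=4)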
diff --git a/_modules/data_juicer/format/mixture_formatter.html b/_modules/data_juicer/format/mixture_formatter.html
index 4236fcd32..ef735a90f 100644
--- a/_modules/data_juicer/format/mixture_formatter.html
+++ b/_modules/data_juicer/format/mixture_formatter.html
@@ -1,18 +1,18 @@
- data_juicer.format.mixture_formatter — data_juicer 1.0.0 documentation
+ data_juicer.format.mixture_formatter — data_juicer 1.0.1 documentation
@@ -87,12 +87,16 @@ Source code for data_juicer.format.mixture_formatter
from .formatter import BaseFormatter, load_formatter
-[docs]class MixtureFormatter(BaseFormatter):
+
+[docs]
+class MixtureFormatter(BaseFormatter):
"""The class mixes multiple datasets by randomly selecting samples from
every dataset and merging them, and then exports the merged dataset as a
new mixed dataset."""
-[docs] def __init__(self,
+
+[docs]
+ def __init__(self,
dataset_path: str,
suffixes: Union[str, List[str], None] = None,
text_keys=None,
@@ -142,6 +146,7 @@ Source code for data_juicer.format.mixture_formatter
**kwargs) for data_prefix in data_prefixes
]
+
def _get_weight(self, data_prefix):
"""
Split every dataset path and its weight.
@@ -167,7 +172,9 @@ Source code for data_juicer.format.mixture_formatter
prefixes.append(value)
return prefixes, weights
-[docs] @classmethod
+
+[docs]
+ @classmethod
def random_sample(cls, dataset, weight=1.0, sample_number=0, seed=None):
"""
Randomly sample a subset from a dataset with weight or number,
@@ -199,7 +206,10 @@ Source code for data_juicer.format.mixture_formatter
return dataset.shuffle(seed=seed).select(sample_index)
-[docs] def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset:
+
+
+[docs]
+ def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset:
"""
Load a mixed dataset.
@@ -220,7 +230,9 @@ Source code for data_juicer.format.mixture_formatter
from data_juicer.core.data import NestedDataset
mixed_dataset = NestedDataset(concatenate_datasets(dataset_list))
logger.info(f'There are {len(mixed_dataset)} in final dataset')
- return mixed_dataset
+ return mixed_dataset
+
+
diff --git a/_modules/data_juicer/format/parquet_formatter.html b/_modules/data_juicer/format/parquet_formatter.html
index 39c8b1b3a..021c82cf6 100644
--- a/_modules/data_juicer/format/parquet_formatter.html
+++ b/_modules/data_juicer/format/parquet_formatter.html
@@ -1,18 +1,18 @@
- data_juicer.format.parquet_formatter — data_juicer 1.0.0 documentation
+ data_juicer.format.parquet_formatter — data_juicer 1.0.1 documentation
@@ -80,7 +80,9 @@ Source code for data_juicer.format.parquet_formatter
from .formatter import FORMATTERS, LocalFormatter
-[docs]@FORMATTERS.register_module()
+
+[docs]
+@FORMATTERS.register_module()
class ParquetFormatter(LocalFormatter):
"""
The class is used to load and format parquet-type files.
@@ -89,7 +91,9 @@ Source code for data_juicer.format.parquet_formatter
"""
SUFFIXES = ['.parquet']
-[docs] def __init__(self, dataset_path, suffixes=None, **kwargs):
+
+[docs]
+ def __init__(self, dataset_path, suffixes=None, **kwargs):
"""
Initialization method.
@@ -102,7 +106,9 @@ Source code for data_juicer.format.parquet_formatter
suffixes=suffixes if suffixes else self.SUFFIXES,
type='parquet',
**kwargs,
- )
+ )
+
+
diff --git a/_modules/data_juicer/format/text_formatter.html b/_modules/data_juicer/format/text_formatter.html
index 46820a783..ee70b89b9 100644
--- a/_modules/data_juicer/format/text_formatter.html
+++ b/_modules/data_juicer/format/text_formatter.html
@@ -1,18 +1,18 @@
- data_juicer.format.text_formatter — data_juicer 1.0.0 documentation
+ data_juicer.format.text_formatter — data_juicer 1.0.1 documentation
@@ -131,7 +131,9 @@ Source code for data_juicer.format.text_formatter
f.write('\n'.join(text))
-[docs]@FORMATTERS.register_module()
+
+[docs]
+@FORMATTERS.register_module()
class TextFormatter(LocalFormatter):
"""
The class is used to load and format text-type files.
@@ -151,7 +153,9 @@ Source code for data_juicer.format.text_formatter
'.m', '.smali'
]
-[docs] def __init__(self,
+
+[docs]
+ def __init__(self,
dataset_path,
suffixes=None,
add_suffix=False,
@@ -175,7 +179,10 @@ Source code for data_juicer.format.text_formatter
self.dataset_path = dataset_path
self.add_suffix = add_suffix
-[docs] def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset:
+
+
+[docs]
+ def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset:
"""
Load a dataset from local text-type files.
@@ -235,7 +242,9 @@ Source code for data_juicer.format.text_formatter
return unify_format(datasets,
text_keys=self.text_keys,
num_proc=num_proc,
- global_cfg=global_cfg)
+ global_cfg=global_cfg)
+
+
diff --git a/_modules/data_juicer/format/tsv_formatter.html b/_modules/data_juicer/format/tsv_formatter.html
index 0e7e22baa..70f13b609 100644
--- a/_modules/data_juicer/format/tsv_formatter.html
+++ b/_modules/data_juicer/format/tsv_formatter.html
@@ -1,18 +1,18 @@
- data_juicer.format.tsv_formatter — data_juicer 1.0.0 documentation
+ data_juicer.format.tsv_formatter — data_juicer 1.0.1 documentation
@@ -80,7 +80,9 @@ Source code for data_juicer.format.tsv_formatter
from .formatter import FORMATTERS, LocalFormatter
-[docs]@FORMATTERS.register_module()
+
+[docs]
+@FORMATTERS.register_module()
class TsvFormatter(LocalFormatter):
"""
The class is used to load and format tsv-type files.
@@ -89,7 +91,9 @@ Source code for data_juicer.format.tsv_formatter
"""
SUFFIXES = ['.tsv']
-[docs] def __init__(self, dataset_path, suffixes=None, **kwargs):
+
+[docs]
+ def __init__(self, dataset_path, suffixes=None, **kwargs):
diff --git a/_modules/data_juicer/ops/base_op.html b/_modules/data_juicer/ops/base_op.html
index 23cdbd2d8..cd8a40f1e 100644
--- a/_modules/data_juicer/ops/base_op.html
+++ b/_modules/data_juicer/ops/base_op.html
@@ -1,18 +1,18 @@
- data_juicer.ops.base_op — data_juicer 1.0.0 documentation
+ data_juicer.ops.base_op — data_juicer 1.0.1 documentation
@@ -149,7 +149,7 @@ Source code for data_juicer.ops.base_op
return wrapper
-def catch_map_single_exception(method):
+def catch_map_single_exception(method, return_sample=True):
"""
For single-map sample-level fault tolerance.
The input sample is expected batch_size = 1.
@@ -171,8 +171,11 @@ Source code for data_juicer.ops.base_op
if is_batched(sample):
try:
sample = convert_dict_list_to_list_dict(sample)[0]
- res_sample = method(sample, *args, **kwargs)
- return convert_list_dict_to_dict_list([res_sample])
+ res = method(sample, *args, **kwargs)
+ if return_sample:
+ return convert_list_dict_to_dict_list([res])
+ else:
+ return [res]
except Exception as e:
from loguru import logger
logger.error(
@@ -245,9 +248,8 @@ Source code for data_juicer.ops.base_op
method = wrap_func_with_nested_access(method)
setattr(self, name, method)
- @classmethod
- def is_batched_op(cls):
- return cls._batched_op
+ def is_batched_op(self):
+ return self._batched_op
def process(self, *args, **kwargs):
raise NotImplementedError
@@ -299,9 +301,13 @@ Source code for data_juicer.ops.base_op
return np.empty((0, 0), dtype=str)
-[docs]class Mapper(OP):
+
+[docs]
+class Mapper(OP):
-[docs] def __init__(self, *args, **kwargs):
+
+[docs]
+ def __init__(self, *args, **kwargs):
"""
Base class that conducts data editing.
@@ -322,6 +328,7 @@ Source code for data_juicer.ops.base_op
else:
self.process = catch_map_single_exception(self.process_single)
+
# set the process method is not allowed to be overridden
def __init_subclass__(cls, **kwargs):
not_allowed_list = ['process']
@@ -332,7 +339,9 @@ Source code for data_juicer.ops.base_op
f'{cls.__name__}. Please implement {method_name}_single '
f'or {method_name}_batched.')
-[docs] def process_batched(self, samples, *args, **kwargs):
+
+[docs]
+ def process_batched(self, samples, *args, **kwargs):
keys = samples.keys()
first_key = next(iter(keys))
num_samples = len(samples[first_key])
@@ -344,7 +353,10 @@ Source code for data_juicer.ops.base_op
return samples
-[docs] def process_single(self, sample):
+
+
+[docs]
+ def process_single(self, sample):
"""
For sample level, sample --> sample
@@ -353,7 +365,10 @@ Source code for data_juicer.ops.base_op
"""
raise NotImplementedError
-[docs] def run(self, dataset, *, exporter=None, tracer=None):
+
+
+[docs]
+ def run(self, dataset, *, exporter=None, tracer=None):
dataset = super(Mapper, self).run(dataset)
new_dataset = dataset.map(
self.process,
@@ -365,12 +380,18 @@ Source code for data_juicer.ops.base_op
if tracer:
tracer.trace_mapper(self._name, dataset, new_dataset,
self.text_key)
- return new_dataset
+ return new_dataset
+
+
-[docs]class Filter(OP):
+
+[docs]
+class Filter(OP):
-[docs] def __init__(self, *args, **kwargs):
+
+[docs]
+ def __init__(self, *args, **kwargs):
"""
Base class that removes specific info.
@@ -394,7 +415,9 @@ Source code for data_juicer.ops.base_op
else:
self.compute_stats = catch_map_single_exception(
self.compute_stats_single)
- self.process = catch_map_single_exception(self.process_single)
+ self.process = catch_map_single_exception(self.process_single,
+ return_sample=False)
+
# set the process method is not allowed to be overridden
def __init_subclass__(cls, **kwargs):
@@ -406,7 +429,9 @@ Source code for data_juicer.ops.base_op
f'{cls.__name__}. Please implement {method_name}_single '
f'or {method_name}_batched.')
-[docs] def compute_stats_batched(self, samples, *args, **kwargs):
+
+[docs]
+ def compute_stats_batched(self, samples, *args, **kwargs):
keys = samples.keys()
num_samples = len(samples[Fields.stats])
for i in range(num_samples):
@@ -419,11 +444,17 @@ Source code for data_juicer.ops.base_op
return samples
-[docs] def process_batched(self, samples):
+
+
+[docs]
+ def process_batched(self, samples):
return map(lambda stat: self.process_single({Fields.stats: stat}),
samples[Fields.stats])
-[docs] def compute_stats_single(self, sample, context=False):
+
+
+[docs]
+ def compute_stats_single(self, sample, context=False):
"""
Compute stats for the sample which is used as a metric to decide
whether to filter this sample.
@@ -435,7 +466,10 @@ Source code for data_juicer.ops.base_op
"""
raise NotImplementedError
-[docs] def process_single(self, sample):
+
+
+[docs]
+ def process_single(self, sample):
"""
For sample level, sample --> Boolean.
@@ -444,7 +478,10 @@ Source code for data_juicer.ops.base_op
"""
raise NotImplementedError
-[docs] def run(self, dataset, *, exporter=None, tracer=None, reduce=True):
+
+
+[docs]
+ def run(self, dataset, *, exporter=None, tracer=None, reduce=True):
dataset = super(Filter, self).run(dataset)
if Fields.stats not in dataset.features:
from data_juicer.core.data import add_same_content_to_new_column
@@ -472,12 +509,18 @@ Source code for data_juicer.ops.base_op
tracer.trace_filter(self._name, dataset, new_dataset)
return new_dataset
else:
- return dataset
+ return dataset
+
+
-[docs]class Deduplicator(OP):
+
+[docs]
+class Deduplicator(OP):
-[docs] def __init__(self, *args, **kwargs):
+
+[docs]
+ def __init__(self, *args, **kwargs):
"""
Base class that conducts deduplication.
@@ -498,7 +541,10 @@ Source code for data_juicer.ops.base_op
else:
self.compute_hash = catch_map_single_exception(self.compute_hash)
-[docs] def compute_hash(self, sample):
+
+
+[docs]
+ def compute_hash(self, sample):
"""
Compute hash values for the sample.
@@ -507,7 +553,10 @@ Source code for data_juicer.ops.base_op
"""
raise NotImplementedError
-[docs] def process(self, dataset, show_num=0):
+
+
+[docs]
+ def process(self, dataset, show_num=0):
"""
For doc-level, dataset --> dataset.
@@ -518,7 +567,10 @@ Source code for data_juicer.ops.base_op
"""
raise NotImplementedError
-[docs] def run(self, dataset, *, exporter=None, tracer=None, reduce=True):
+
+
+[docs]
+ def run(self, dataset, *, exporter=None, tracer=None, reduce=True):
dataset = super(Deduplicator, self).run(dataset)
dataset = dataset.map(self.compute_hash,
num_proc=self.runtime_np(),
@@ -531,12 +583,18 @@ Source code for data_juicer.ops.base_op
tracer.trace_deduplicator(self._name, dup_pairs)
return new_dataset
else:
- return dataset
+ return dataset
+
-[docs]class Selector(OP):
-[docs] def __init__(self, *args, **kwargs):
+
+[docs]
+class Selector(OP):
+
+
+[docs]
+ def __init__(self, *args, **kwargs):
"""
Base class that conducts selection in dataset-level.
@@ -551,7 +609,10 @@ Source code for data_juicer.ops.base_op
"""
super(Selector, self).__init__(*args, **kwargs)
-[docs] def process(self, dataset):
+
+
+[docs]
+ def process(self, dataset):
"""
Dataset --> dataset.
@@ -560,12 +621,17 @@ Source code for data_juicer.ops.base_op
"""
raise NotImplementedError
-[docs] def run(self, dataset, *, exporter=None, tracer=None):
+
+
+[docs]
+ def run(self, dataset, *, exporter=None, tracer=None):
dataset = super(Selector, self).run(dataset)
new_dataset = self.process(dataset)
if tracer:
tracer.trace_filter(self._name, dataset, new_dataset)
- return new_dataset
+ return new_dataset
+
+
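
The base_op changes make is_batched_op an instance method and let catch_map_single_exception return bare values for Filter.process (return_sample=False). A hedged sketch of a custom single-sample Mapper against this API; the OP name is hypothetical:

    from data_juicer.ops.base_op import OPERATORS, Mapper

    @OPERATORS.register_module('lowercase_mapper')  # hypothetical OP name
    class LowercaseMapper(Mapper):

        def process_single(self, sample):
            # sample-level editing: sample --> sample, as documented above
            sample[self.text_key] = sample[self.text_key].lower()
            return sample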
diff --git a/_modules/data_juicer/ops/common/helper_func.html b/_modules/data_juicer/ops/common/helper_func.html
index 0d730be3f..4040d75fa 100644
--- a/_modules/data_juicer/ops/common/helper_func.html
+++ b/_modules/data_juicer/ops/common/helper_func.html
@@ -1,18 +1,18 @@
- data_juicer.ops.common.helper_func — data_juicer 1.0.0 documentation
+ data_juicer.ops.common.helper_func — data_juicer 1.0.1 documentation
@@ -104,7 +104,9 @@ Source code for data_juicer.ops.common.helper_func
self.parent[px] = self.parent[py] = min(px, py)
-[docs]def strip(document, strip_characters):
+
+[docs]
+def strip(document, strip_characters):
"""
Way faster than document.strip(strip_characters) since strip_characters is
now a set instead of a str, and it contains a lot of elements (all the
@@ -132,7 +134,10 @@ Source code for data_juicer.ops.common.helper_func
return document_stripped
-[docs]def split_on_whitespace(document, new_line=False, tab=False):
+
+
+[docs]
+def split_on_whitespace(document, new_line=False, tab=False):
"""
This method also removes concatenated spaces.
@@ -148,7 +153,10 @@ Source code for data_juicer.ops.common.helper_func
return split_document
-[docs]def split_on_newline_tab_whitespace(document):
+
+
+[docs]
+def split_on_newline_tab_whitespace(document):
"""
This method is used to split the document into different levels of sub-
sentences.
@@ -165,7 +173,10 @@ Source code for data_juicer.ops.common.helper_func
return sentences
-[docs]def merge_on_whitespace_tab_newline(sentences):
+
+
+[docs]
+def merge_on_whitespace_tab_newline(sentences):
"""
This method is used to merge different levels of sub-sentences into one
document. Invert the method split_on_newline_tab_whitespace. Removes
@@ -184,7 +195,10 @@ Source code for data_juicer.ops.common.helper_func
return document
-[docs]def words_augmentation(words, group_size, join_char):
+
+
+[docs]
+def words_augmentation(words, group_size, join_char):
"""
Augment words, especially for Chinese (without a space between words) and
Vietnamese (with a space between syllables).
@@ -201,7 +215,10 @@ Source code for data_juicer.ops.common.helper_func
return augmentation
-[docs]def get_words_from_document(
+
+
+[docs]
+def get_words_from_document(
document,
token_func=None,
new_line=True,
@@ -225,7 +242,10 @@ Source code for data_juicer.ops.common.helper_func
return words
-[docs]def words_refinement(words,
+
+
+[docs]
+def words_refinement(words,
lower_case=False,
strip_chars=None,
use_words_aug=False,
@@ -262,7 +282,10 @@ Source code for data_juicer.ops.common.helper_func
return words
-[docs]def get_sentences_from_document(document, model_func=None):
+
+
+[docs]
+def get_sentences_from_document(document, model_func=None):
"""
Get sentences from a document.
@@ -279,7 +302,10 @@ Source code for data_juicer.ops.common.helper_func
return '\n'.join(sentences)
-[docs]def split_text_by_punctuation(text):
+
+
+[docs]
+def split_text_by_punctuation(text):
"""
Split text by any zh and en punctuation
@@ -293,6 +319,7 @@ Source code for data_juicer.ops.common.helper_func
result = [s.strip() for s in result if s.strip()]
return result
+
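
The helper_func hunks are anchor reflows only. A small sketch of two of the helpers listed above:

    from data_juicer.ops.common.helper_func import (split_on_whitespace,
                                                    words_augmentation)

    words = split_on_whitespace('a quick brown fox')
    # adjacent-word groups, the augmentation described for languages
    # written without spaces between words
    pairs = words_augmentation(words, group_size=2, join_char='')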
diff --git a/_modules/data_juicer/ops/deduplicator/document_deduplicator.html b/_modules/data_juicer/ops/deduplicator/document_deduplicator.html
index 2dd630459..71cdeef00 100644
--- a/_modules/data_juicer/ops/deduplicator/document_deduplicator.html
+++ b/_modules/data_juicer/ops/deduplicator/document_deduplicator.html
@@ -1,18 +1,18 @@
- data_juicer.ops.deduplicator.document_deduplicator — data_juicer 1.0.0 documentation
+ data_juicer.ops.deduplicator.document_deduplicator — data_juicer 1.0.1 documentation
@@ -93,7 +93,9 @@ Source code for data_juicer.ops.deduplicator.document_deduplicator
from ..base_op import OPERATORS, Deduplicator
-[docs]@OPERATORS.register_module('document_deduplicator')
+
+[docs]
+@OPERATORS.register_module('document_deduplicator')
class DocumentDeduplicator(Deduplicator):
"""
Deduplicator to deduplicate samples at document-level using exact matching.
@@ -101,7 +103,9 @@ Source code for data_juicer.ops.deduplicator.document_deduplicator
Using md5 hash to deduplicate samples.
"""
-[docs] def __init__(self,
+
+[docs]
+ def __init__(self,
lowercase: bool = False,
ignore_non_character: bool = False,
*args,
@@ -121,7 +125,10 @@ Source code for data_juicer.ops.deduplicator.document_deduplicator
f'\s+|\d+|[{re.escape(string.punctuation)}]' # noqa: W605
) if ignore_non_character else None
-[docs] def compute_hash(self, sample):
+
+
+[docs]
+ def compute_hash(self, sample):
"""
Compute md5 hash values for the sample.
@@ -144,7 +151,10 @@ Source code for data_juicer.ops.deduplicator.document_deduplicator
sample[HashKeys.hash] = _get_hash(text)
return sample
-[docs] def process(self, dataset, show_num=0):
+
+
+[docs]
+ def process(self, dataset, show_num=0):
"""
For doc-level, dataset --> dataset.
@@ -188,7 +198,9 @@ Source code for data_juicer.ops.deduplicator.document_deduplicator
_filter_dup_helper,
fn_kwargs=dict(hashes=hashes),
load_from_cache_file=False if show_num > 0 else True) # num_proc=1
- return dataset, dup_pairs
+ return dataset, dup_pairs
+
+
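
A minimal sketch of the exact-match deduplicator shown above, assuming ds is a NestedDataset; run() maps compute_hash over the samples and then drops duplicates:

    from data_juicer.ops.deduplicator.document_deduplicator import \
        DocumentDeduplicator

    dedup = DocumentDeduplicator(lowercase=True, ignore_non_character=True)
    deduped = dedup.run(ds)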
diff --git a/_modules/data_juicer/ops/deduplicator/document_minhash_deduplicator.html b/_modules/data_juicer/ops/deduplicator/document_minhash_deduplicator.html
index ad84dd701..07f066f61 100644
--- a/_modules/data_juicer/ops/deduplicator/document_minhash_deduplicator.html
+++ b/_modules/data_juicer/ops/deduplicator/document_minhash_deduplicator.html
@@ -1,18 +1,18 @@
- data_juicer.ops.deduplicator.document_minhash_deduplicator — data_juicer 1.0.0 documentation
+ data_juicer.ops.deduplicator.document_minhash_deduplicator — data_juicer 1.0.1 documentation
@@ -175,7 +175,9 @@ Source code for data_juicer.ops.deduplicator.document_minhash_deduplicator
return opt
-[docs]@OPERATORS.register_module(OP_NAME)
+
+[docs]
+@OPERATORS.register_module(OP_NAME)
class DocumentMinhashDeduplicator(Deduplicator):
"""
Deduplicator to deduplicate samples at document-level using MinHashLSH.
@@ -184,7 +186,9 @@ Source code for data_juicer.ops.deduplicator.document_minhash_deduplicator
kept in the final dataset.
"""
-[docs] def __init__(
+
+[docs]
+ def __init__(
self,
tokenization: str = 'space',
window_size: PositiveInt = 5,
@@ -283,7 +287,10 @@ Source code for data_juicer.ops.deduplicator.document_minhash_deduplicator
dtype=np.uint64,
).T
-[docs] def compute_hash(self, sample):
+
+
+[docs]
+ def compute_hash(self, sample):
"""
Compute minhash values for the sample.
@@ -347,7 +354,10 @@ Source code for data_juicer.ops.deduplicator.document_minhash_deduplicator
]
return sample
-[docs] def process(self, dataset, show_num=0):
+
+
+[docs]
+ def process(self, dataset, show_num=0):
"""
For doc-level, dataset --> dataset.
@@ -416,7 +426,9 @@ Source code for data_juicer.ops.deduplicator.document_minhash_deduplicator
)
logger.info(f'Keep {len(dataset)} samples after MinHash dedup.')
- return dataset, dup_pairs
+ return dataset, dup_pairs
+
+
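Usage of the MinHash variant mirrors the exact-match one: compute per-document signatures, then let process cluster near-duplicates via LSH. A sketch with the defaults shown above (default text key 'text' assumed):

from datasets import Dataset
from data_juicer.ops.deduplicator.document_minhash_deduplicator import DocumentMinhashDeduplicator

op = DocumentMinhashDeduplicator(tokenization='space', window_size=5)
ds = Dataset.from_list([
    {'text': 'the quick brown fox jumps over the lazy dog ' * 5},
    {'text': 'the quick brown fox jumps over the lazy dog ' * 5 + 'again'},
    {'text': 'an entirely different document about corpus cleaning and dedup'},
])
ds = ds.map(op.compute_hash)     # MinHash signature per document
ds, dup_pairs = op.process(ds)   # one document kept per near-duplicate cluster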
diff --git a/_modules/data_juicer/ops/deduplicator/document_simhash_deduplicator.html b/_modules/data_juicer/ops/deduplicator/document_simhash_deduplicator.html
index 04358f166..0f686c89b 100644
--- a/_modules/data_juicer/ops/deduplicator/document_simhash_deduplicator.html
+++ b/_modules/data_juicer/ops/deduplicator/document_simhash_deduplicator.html
@@ -1,18 +1,18 @@
-
+
- data_juicer.ops.deduplicator.document_simhash_deduplicator — data_juicer 1.0.0 documentation
+ data_juicer.ops.deduplicator.document_simhash_deduplicator — data_juicer 1.0.1 documentation
-
-
-
+
+
+
@@ -100,11 +100,15 @@ Source code for data_juicer.ops.deduplicator.document_simhash_deduplicator
OP_NAME = 'document_simhash_deduplicator'
-[docs]@OPERATORS.register_module(OP_NAME)
+
+[docs]
+@OPERATORS.register_module(OP_NAME)
class DocumentSimhashDeduplicator(Deduplicator):
"""Deduplicator to deduplicate samples at document-level using SimHash."""
-[docs] def __init__(self,
+
+[docs]
+ def __init__(self,
tokenization: str = 'space',
window_size: PositiveInt = 6,
lowercase: bool = True,
@@ -153,7 +157,10 @@ Source code for data_juicer.ops.deduplicator.document_simhash_deduplicator
self.num_blocks = num_blocks
self.hamming_distance = hamming_distance
-[docs] def compute_hash(self, sample):
+
+
+[docs]
+ def compute_hash(self, sample):
"""
Compute simhash values for the sample.
@@ -199,7 +206,10 @@ Source code for data_juicer.ops.deduplicator.document_simhash_deduplicator
np.uint64(simhash.compute(map(simhash.unsigned_hash, tokens))))
return sample
-[docs] def process(self, dataset, show_num=0):
+
+
+[docs]
+ def process(self, dataset, show_num=0):
"""
For doc-level, dataset --> dataset.
@@ -302,7 +312,9 @@ Source code for data_juicer.ops.deduplicator.document_simhash_deduplicator
load_from_cache_file=False if show_num > 0 else True)
logger.info(f'Keep {len(dataset)} samples after SimHash dedup.')
- return dataset, dup_pairs
+ return dataset, dup_pairs
+
+
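The SimHash variant follows the same two-step flow; window_size, num_blocks and hamming_distance control how tolerant the match is. A sketch under the same assumptions as above:

from datasets import Dataset
from data_juicer.ops.deduplicator.document_simhash_deduplicator import DocumentSimhashDeduplicator

corpus = [
    'data juicer provides many operators for cleaning text corpora at scale',
    'data juicer provides many operators for cleaning text corpora at scale!',
    'a completely unrelated sentence about astronomy and telescopes tonight',
]
op = DocumentSimhashDeduplicator(tokenization='space', window_size=6)
ds = Dataset.from_list([{'text': t} for t in corpus])
ds = ds.map(op.compute_hash)     # one 64-bit SimHash per document
ds, dup_pairs = op.process(ds)   # docs within the hamming budget collapse to one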
diff --git a/_modules/data_juicer/ops/deduplicator/image_deduplicator.html b/_modules/data_juicer/ops/deduplicator/image_deduplicator.html
index 16c59e270..9c944d20d 100644
--- a/_modules/data_juicer/ops/deduplicator/image_deduplicator.html
+++ b/_modules/data_juicer/ops/deduplicator/image_deduplicator.html
@@ -1,18 +1,18 @@
-
+
- data_juicer.ops.deduplicator.image_deduplicator — data_juicer 1.0.0 documentation
+ data_juicer.ops.deduplicator.image_deduplicator — data_juicer 1.0.1 documentation
-
-
-
+
+
+
@@ -109,7 +109,9 @@ Source code for data_juicer.ops.deduplicator.image_deduplicator
return mapping[method_name]
-[docs]@OPERATORS.register_module(OP_NAME)
+
+[docs]
+@OPERATORS.register_module(OP_NAME)
@LOADED_IMAGES.register_module(OP_NAME)
class ImageDeduplicator(Deduplicator):
"""
@@ -117,7 +119,9 @@ Source code for data_juicer.ops.deduplicator.image_deduplicator
of images between documents.
"""
-[docs] def __init__(self,
+
+[docs]
+ def __init__(self,
method: str = 'phash',
consider_text: bool = False,
*args,
@@ -141,7 +145,10 @@ Source code for data_juicer.ops.deduplicator.image_deduplicator
if self.consider_text:
self.text_dedup_op = DocumentDeduplicator(**kwargs)
-[docs] def compute_hash(self, sample, context=False):
+
+
+[docs]
+ def compute_hash(self, sample, context=False):
# get hash of text first
if self.consider_text:
sample = self.text_dedup_op.compute_hash(sample)
@@ -165,7 +172,10 @@ Source code for data_juicer.ops.deduplicator.image_deduplicator
image_array=np.array(images[key]))
return sample
-[docs] def process(self, dataset, show_num=0):
+
+
+[docs]
+ def process(self, dataset, show_num=0):
"""
For doc-level, dataset --> dataset.
@@ -221,7 +231,9 @@ Source code for data_juicer.ops.deduplicator.image_deduplicator
_filter_dup_helper,
fn_kwargs=dict(hashes=hashes),
load_from_cache_file=False if show_num > 0 else True) # num_proc=1
- return dataset, dup_pairs
+ return dataset, dup_pairs
+
+
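The image deduplicator hashes every image in a sample with a perceptual hasher (phash by default) and removes samples whose hashes match exactly. A sketch, assuming the default image key 'images'; the file paths are placeholders and must point to real images for compute_hash to run:

from datasets import Dataset
from data_juicer.ops.deduplicator.image_deduplicator import ImageDeduplicator

op = ImageDeduplicator(method='phash', consider_text=False)
ds = Dataset.from_list([
    {'images': ['imgs/cat.jpg']},
    {'images': ['imgs/cat_copy.jpg']},   # a byte-identical copy hashes the same
])
ds = ds.map(op.compute_hash)    # perceptual hash over each sample's images
ds, dup_pairs = op.process(ds)  # exact hash matches are dropped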
diff --git a/_modules/data_juicer/ops/deduplicator/ray_basic_deduplicator.html b/_modules/data_juicer/ops/deduplicator/ray_basic_deduplicator.html
index 035151e1c..426a2bf24 100644
--- a/_modules/data_juicer/ops/deduplicator/ray_basic_deduplicator.html
+++ b/_modules/data_juicer/ops/deduplicator/ray_basic_deduplicator.html
@@ -1,18 +1,18 @@
-
+
- data_juicer.ops.deduplicator.ray_basic_deduplicator — data_juicer 1.0.0 documentation
+ data_juicer.ops.deduplicator.ray_basic_deduplicator — data_juicer 1.0.1 documentation
-
-
-
+
+
+
@@ -87,7 +87,9 @@ Source code for data_juicer.ops.deduplicator.ray_basic_deduplicator
redis = LazyLoader('redis', 'redis')
-[docs]class RayBasicDeduplicator(Filter):
+
+[docs]
+class RayBasicDeduplicator(Filter):
"""
A basic exact matching deduplicator for RAY.
Although its functionality is deduplication,
@@ -97,7 +99,9 @@ Source code for data_juicer.ops.deduplicator.ray_basic_deduplicator
# TODO: Set a more reasonable value
EMPTY_HASH_VALUE = 'EMPTY'
-[docs] def __init__(self,
+
+[docs]
+ def __init__(self,
redis_host: str = 'localhost',
redis_port: PositiveInt = 6380,
*args,
@@ -117,11 +121,17 @@ Source code for data_juicer.ops.deduplicator.ray_basic_deduplicator
r = redis.StrictRedis(host=self.redis_host, port=self.redis_port, db=0)
r.flushdb(0)
-[docs] def calculate_hash(self, sample, context=False):
+
+
+[docs]
+ def calculate_hash(self, sample, context=False):
"""Calculate hash value for the sample."""
raise NotImplementedError
-[docs] def compute_stats_single(self, sample, context=False):
+
+
+[docs]
+ def compute_stats_single(self, sample, context=False):
# init redis client
r = redis.StrictRedis(host=self.redis_host, port=self.redis_port, db=0)
# compute hash
@@ -130,8 +140,13 @@ Source code for data_juicer.ops.deduplicator.ray_basic_deduplicator
sample[HashKeys.is_duplicate] = r.setnx(md5_value, 1)
return sample
-
+
+
+
+
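RayBasicDeduplicator only needs calculate_hash from its subclasses: compute_stats_single runs Redis SETNX on that hash, so the first occurrence of a value is marked and later occurrences can be filtered out. Note the constructor connects to Redis (port 6380 by default) and flushes db 0, so a reachable Redis instance is required before instantiating. A hypothetical subclass sketch:

from data_juicer.ops.deduplicator.ray_basic_deduplicator import RayBasicDeduplicator

class FieldDeduplicator(RayBasicDeduplicator):
    """Hypothetical example: deduplicate on an arbitrary sample field."""

    def calculate_hash(self, sample, context=False):
        value = sample.get('my_field')   # 'my_field' is a made-up key
        if not value:
            return RayBasicDeduplicator.EMPTY_HASH_VALUE
        return str(value)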
diff --git a/_modules/data_juicer/ops/deduplicator/ray_document_deduplicator.html b/_modules/data_juicer/ops/deduplicator/ray_document_deduplicator.html
index 9e2835196..02edeaee7 100644
--- a/_modules/data_juicer/ops/deduplicator/ray_document_deduplicator.html
+++ b/_modules/data_juicer/ops/deduplicator/ray_document_deduplicator.html
@@ -1,18 +1,18 @@
-
+
- data_juicer.ops.deduplicator.ray_document_deduplicator — data_juicer 1.0.0 documentation
+ data_juicer.ops.deduplicator.ray_document_deduplicator — data_juicer 1.0.1 documentation
-
-
-
+
+
+
@@ -89,13 +89,17 @@ Source code for data_juicer.ops.deduplicator.ray_document_deduplicator
OP_NAME = 'ray_document_deduplicator'
-[docs]@OPERATORS.register_module(OP_NAME)
+
+[docs]
+@OPERATORS.register_module(OP_NAME)
class RayDocumentDeduplicator(RayBasicDeduplicator):
"""
Deduplicator to deduplicate samples at document-level using exact matching.
"""
-[docs] def __init__(self,
+
+[docs]
+ def __init__(self,
redis_host: str = 'localhost',
redis_port: PositiveInt = 6380,
lowercase: bool = False,
@@ -121,7 +125,10 @@ Source code for data_juicer.ops.deduplicator.ray_document_deduplicator
f'\s+|\d+|[{re.escape(string.punctuation)}]' # noqa: W605
) if ignore_non_character else None
-[docs] def calculate_hash(self, sample, context=False):
+
+
+[docs]
+ def calculate_hash(self, sample, context=False):
if self.text_key not in sample or not sample[self.text_key]:
return RayBasicDeduplicator.EMPTY_HASH_VALUE
@@ -131,7 +138,9 @@ Source code for data_juicer.ops.deduplicator.ray_document_deduplicator
if self.remove_non_character_regex:
text = self.remove_non_character_regex.sub('', text)
- return hashlib.md5(text.strip().encode('utf-8')).hexdigest()
+ return hashlib.md5(text.strip().encode('utf-8')).hexdigest()
+
+
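calculate_hash itself is pure (md5 over the normalised text), but constructing the operator already connects to Redis and flushes db 0, so the sketch below assumes a local Redis on port 6380 and the default text key 'text':

from data_juicer.ops.deduplicator.ray_document_deduplicator import RayDocumentDeduplicator

op = RayDocumentDeduplicator(redis_host='localhost', redis_port=6380,
                             lowercase=True, ignore_non_character=False)
h = op.calculate_hash({'text': 'Hello Ray!'})   # md5 hexdigest of the lowercased text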
diff --git a/_modules/data_juicer/ops/deduplicator/ray_image_deduplicator.html b/_modules/data_juicer/ops/deduplicator/ray_image_deduplicator.html
index f637aca12..e8fab5a60 100644
--- a/_modules/data_juicer/ops/deduplicator/ray_image_deduplicator.html
+++ b/_modules/data_juicer/ops/deduplicator/ray_image_deduplicator.html
@@ -1,18 +1,18 @@
-
+
- data_juicer.ops.deduplicator.ray_image_deduplicator — data_juicer 1.0.0 documentation
+ data_juicer.ops.deduplicator.ray_image_deduplicator — data_juicer 1.0.1 documentation
-
-
-
+
+
+
@@ -106,7 +106,9 @@ Source code for data_juicer.ops.deduplicator.ray_image_deduplicator
return mapping[method_name]
-[docs]@OPERATORS.register_module(OP_NAME)
+
+[docs]
+@OPERATORS.register_module(OP_NAME)
@LOADED_IMAGES.register_module(OP_NAME)
class RayImageDeduplicator(RayBasicDeduplicator):
"""
@@ -114,7 +116,9 @@ Source code for data_juicer.ops.deduplicator.ray_image_deduplicator
of images between documents.
"""
-[docs] def __init__(self,
+
+[docs]
+ def __init__(self,
redis_host: str = 'localhost',
redis_port: PositiveInt = 6380,
method: str = 'phash',
@@ -136,7 +140,10 @@ Source code for data_juicer.ops.deduplicator.ray_image_deduplicator
f'Can only be one of {HASH_METHOD}.')
self.hasher = get_hash_method(method)()
-[docs] def calculate_hash(self, sample, context=False):
+
+
+[docs]
+ def calculate_hash(self, sample, context=False):
if self.image_key not in sample or not sample[self.image_key]:
return RayBasicDeduplicator.EMPTY_HASH_VALUE
@@ -151,7 +158,9 @@ Source code for data_juicer.ops.deduplicator.ray_image_deduplicator
hash_value += self.hasher.encode_image(
image_array=np.array(images[key]))
- return hash_value
+ return hash_value
+
+
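The Ray image variant concatenates the perceptual hashes of all images in a sample; an empty image list maps to EMPTY_HASH_VALUE. A sketch under the same Redis assumption, with placeholder paths and the default image key 'images':

from data_juicer.ops.deduplicator.ray_image_deduplicator import RayImageDeduplicator

op = RayImageDeduplicator(redis_host='localhost', redis_port=6380, method='phash')
h = op.calculate_hash({'images': ['imgs/a.jpg', 'imgs/b.jpg']})  # joined per-image hashes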
diff --git a/_modules/data_juicer/ops/deduplicator/ray_video_deduplicator.html b/_modules/data_juicer/ops/deduplicator/ray_video_deduplicator.html
index 7edbb0fcf..97aae970b 100644
--- a/_modules/data_juicer/ops/deduplicator/ray_video_deduplicator.html
+++ b/_modules/data_juicer/ops/deduplicator/ray_video_deduplicator.html
@@ -1,18 +1,18 @@
-
+
- data_juicer.ops.deduplicator.ray_video_deduplicator — data_juicer 1.0.0 documentation
+ data_juicer.ops.deduplicator.ray_video_deduplicator — data_juicer 1.0.1 documentation
-
-
-
+
+
+
@@ -91,7 +91,9 @@ Source code for data_juicer.ops.deduplicator.ray_video_deduplicator
OP_NAME = 'ray_video_deduplicator'
-[docs]@OPERATORS.register_module(OP_NAME)
+
+[docs]
+@OPERATORS.register_module(OP_NAME)
@LOADED_VIDEOS.register_module(OP_NAME)
class RayVideoDeduplicator(RayBasicDeduplicator):
"""
@@ -99,7 +101,9 @@ Source code for data_juicer.ops.deduplicator.ray_video_deduplicator
of videos between documents.
"""
-[docs] def __init__(self,
+
+[docs]
+ def __init__(self,
redis_host: str = 'localhost',
redis_port: PositiveInt = 6380,
*args,
@@ -116,7 +120,10 @@ Source code for data_juicer.ops.deduplicator.ray_video_deduplicator
*args,
**kwargs)
-[docs] def calculate_hash(self, sample, context=False):
+
+
+[docs]
+ def calculate_hash(self, sample, context=False):
if self.video_key not in sample or not sample[self.video_key]:
return RayBasicDeduplicator.EMPTY_HASH_VALUE
@@ -135,7 +142,9 @@ Source code for data_juicer.ops.deduplicator.ray_video_deduplicator
for key in videos:
close_video(videos[key])
- return md5_hash.hexdigest()
+ return md5_hash.hexdigest()
+
+
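The Ray video variant feeds the packet data of every video into one md5 and returns the hexdigest. Same Redis assumption; the clip path is a placeholder and the default video key 'videos' is assumed:

from data_juicer.ops.deduplicator.ray_video_deduplicator import RayVideoDeduplicator

op = RayVideoDeduplicator(redis_host='localhost', redis_port=6380)
h = op.calculate_hash({'videos': ['clips/demo.mp4']})   # md5 over the video streams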
diff --git a/_modules/data_juicer/ops/deduplicator/video_deduplicator.html b/_modules/data_juicer/ops/deduplicator/video_deduplicator.html
index 3a7bf5c0c..cf051c1cc 100644
--- a/_modules/data_juicer/ops/deduplicator/video_deduplicator.html
+++ b/_modules/data_juicer/ops/deduplicator/video_deduplicator.html
@@ -1,18 +1,18 @@
-
+
- data_juicer.ops.deduplicator.video_deduplicator — data_juicer 1.0.0 documentation
+ data_juicer.ops.deduplicator.video_deduplicator — data_juicer 1.0.1 documentation
-
-
-
+
+
+
@@ -92,7 +92,9 @@ Source code for data_juicer.ops.deduplicator.video_deduplicator
OP_NAME = 'video_deduplicator'
-[docs]@OPERATORS.register_module(OP_NAME)
+
+[docs]
+@OPERATORS.register_module(OP_NAME)
@LOADED_VIDEOS.register_module(OP_NAME)
class VideoDeduplicator(Deduplicator):
"""
@@ -100,7 +102,9 @@ Source code for data_juicer.ops.deduplicator.video_deduplicator
of videos between documents.
"""
-[docs] def __init__(self, consider_text: bool = False, *args, **kwargs):
+
+[docs]
+ def __init__(self, consider_text: bool = False, *args, **kwargs):
"""
Initialization.
@@ -115,7 +119,10 @@ Source code for data_juicer.ops.deduplicator.video_deduplicator
if self.consider_text:
self.text_dedup_op = DocumentDeduplicator(**kwargs)
-[docs] def compute_hash(self, sample, context=False):
+
+
+[docs]
+ def compute_hash(self, sample, context=False):
# get hash of text first
if self.consider_text:
sample = self.text_dedup_op.compute_hash(sample)
@@ -147,7 +154,10 @@ Source code for data_juicer.ops.deduplicator.video_deduplicator
sample[HashKeys.videohash] = md5_hash.hexdigest()
return sample
-[docs] def process(self, dataset, show_num=0):
+
+
+[docs]
+ def process(self, dataset, show_num=0):
"""
For doc-level, dataset --> dataset.
@@ -203,7 +213,9 @@ Source code for data_juicer.ops.deduplicator.video_deduplicator
_filter_dup_helper,
fn_kwargs=dict(hashes=hashes),
load_from_cache_file=False if show_num > 0 else True) # num_proc=1
- return dataset, dup_pairs
+ return dataset, dup_pairs
+
+
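The non-Ray video deduplicator is driven like the document one: map compute_hash over the dataset, then process. With consider_text=True the text hash is computed as well and is taken into account alongside the video hash. A sketch with placeholder clip paths and the default keys 'videos'/'text':

from datasets import Dataset
from data_juicer.ops.deduplicator.video_deduplicator import VideoDeduplicator

op = VideoDeduplicator(consider_text=True)
ds = Dataset.from_list([
    {'videos': ['clips/a.mp4'], 'text': 'caption'},
    {'videos': ['clips/a_copy.mp4'], 'text': 'caption'},
])
ds = ds.map(op.compute_hash)    # md5 over video packets, plus the text hash here
ds, dup_pairs = op.process(ds)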
diff --git a/_modules/data_juicer/ops/filter/alphanumeric_filter.html b/_modules/data_juicer/ops/filter/alphanumeric_filter.html
index 829df732f..d618ef9a6 100644
--- a/_modules/data_juicer/ops/filter/alphanumeric_filter.html
+++ b/_modules/data_juicer/ops/filter/alphanumeric_filter.html
@@ -1,18 +1,18 @@
-
+
- data_juicer.ops.filter.alphanumeric_filter — data_juicer 1.0.0 documentation
+ data_juicer.ops.filter.alphanumeric_filter — data_juicer 1.0.1 documentation
-
-
-
+
+
+
@@ -88,14 +88,18 @@ Source code for data_juicer.ops.filter.alphanumeric_filter
OP_NAME = 'alphanumeric_filter'
-[docs]@OPERATORS.register_module('alphanumeric_filter')
+
+[docs]
+@OPERATORS.register_module('alphanumeric_filter')
class AlphanumericFilter(Filter):
"""Filter to keep samples with alphabet/numeric ratio within a specific
range."""
_batched_op = True
-[docs] def __init__(self,
+
+[docs]
+ def __init__(self,
tokenization: bool = False,
min_ratio: float = 0.25,
max_ratio: float = sys.maxsize,
@@ -129,7 +133,10 @@ Source code for data_juicer.ops.filter.alphanumeric_filter
pretrained_model_name_or_path='EleutherAI/pythia-6.9b-deduped',
return_model=False)
-[docs] def compute_stats_batched(self, samples):
+
+
+[docs]
+ def compute_stats_batched(self, samples):
samples_list = samples[self.text_key]
samples_stats = samples[Fields.stats]
@@ -157,7 +164,10 @@ Source code for data_juicer.ops.filter.alphanumeric_filter
return samples
-[docs] def process_batched(self, samples):
+
+
+[docs]
+ def process_batched(self, samples):
ratio_key = StatsKeys.alpha_token_ratio if self.tokenization \
else StatsKeys.alnum_ratio
if isinstance(samples[Fields.stats], list):
@@ -170,7 +180,9 @@ Source code for data_juicer.ops.filter.alphanumeric_filter
Fields.stats][ratio_key] <= self.max_ratio:
return True
else:
- return False
+ return False
+
+
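Because AlphanumericFilter is a batched op, it works on column-oriented dicts and expects the stats slots to exist already (the pipeline normally adds them). A standalone sketch, assuming the constants live in data_juicer.utils.constant and the default text key 'text':

from data_juicer.ops.filter.alphanumeric_filter import AlphanumericFilter
from data_juicer.utils.constant import Fields

op = AlphanumericFilter(tokenization=False, min_ratio=0.25)
samples = {
    'text': ['abc 123 def', '!!! ???'],
    Fields.stats: [{}, {}],              # empty stats dicts the op fills in
}
samples = op.compute_stats_batched(samples)
keep = list(op.process_batched(samples))  # e.g. [True, False]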
diff --git a/_modules/data_juicer/ops/filter/audio_duration_filter.html b/_modules/data_juicer/ops/filter/audio_duration_filter.html
index 70880f701..73044f949 100644
--- a/_modules/data_juicer/ops/filter/audio_duration_filter.html
+++ b/_modules/data_juicer/ops/filter/audio_duration_filter.html
@@ -1,18 +1,18 @@
-
+
- data_juicer.ops.filter.audio_duration_filter — data_juicer 1.0.0 documentation
+ data_juicer.ops.filter.audio_duration_filter — data_juicer 1.0.1 documentation
-
-
-
+
+
+
@@ -91,13 +91,17 @@ Source code for data_juicer.ops.filter.audio_duration_filter
OP_NAME = 'audio_duration_filter'
-[docs]@OPERATORS.register_module(OP_NAME)
+
+[docs]
+@OPERATORS.register_module(OP_NAME)
@LOADED_AUDIOS.register_module(OP_NAME)
class AudioDurationFilter(Filter):
"""Keep data samples whose audios' durations are within a specified range.
"""
-[docs] def __init__(self,
+
+[docs]
+ def __init__(self,
min_duration: int = 0,
max_duration: int = sys.maxsize,
any_or_all: str = 'any',
@@ -125,7 +129,10 @@ Source code for data_juicer.ops.filter.audio_duration_filter
f'Can only be one of ["any", "all"].')
self.any = (any_or_all == 'any')
-[docs] def compute_stats_single(self, sample, context=False):
+
+
+[docs]
+ def compute_stats_single(self, sample, context=False):
# check if it's computed already
if StatsKeys.audio_duration in sample[Fields.stats]:
return sample
@@ -153,7 +160,10 @@ Source code for data_juicer.ops.filter.audio_duration_filter
return sample
-[docs] def process_single(self, sample):
+
+
+[docs]
+ def process_single(self, sample):
audio_durations = sample[Fields.stats][StatsKeys.audio_duration]
keep_bools = np.array([
self.min_duration <= duration <= self.max_duration
@@ -166,7 +176,9 @@ Source code for data_juicer.ops.filter.audio_duration_filter
if self.any:
return keep_bools.any()
else:
- return keep_bools.all()
+ return keep_bools.all()
+
+
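The single-sample filters follow a compute-then-decide pattern: compute_stats_single writes the per-audio durations into the stats dict, and process_single applies the range check with the any/all strategy. A sketch with a placeholder audio path, assuming the default audio key 'audios' and the constants module data_juicer.utils.constant:

from data_juicer.ops.filter.audio_duration_filter import AudioDurationFilter
from data_juicer.utils.constant import Fields

op = AudioDurationFilter(min_duration=5, max_duration=600, any_or_all='any')
sample = {'audios': ['audio/clip.wav'], Fields.stats: {}}
sample = op.compute_stats_single(sample)   # fills in the duration of each audio
keep = op.process_single(sample)           # True if any duration lies in [5, 600]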
diff --git a/_modules/data_juicer/ops/filter/audio_nmf_snr_filter.html b/_modules/data_juicer/ops/filter/audio_nmf_snr_filter.html
index 79ce82297..3b9761576 100644
--- a/_modules/data_juicer/ops/filter/audio_nmf_snr_filter.html
+++ b/_modules/data_juicer/ops/filter/audio_nmf_snr_filter.html
@@ -1,18 +1,18 @@
-
+
- data_juicer.ops.filter.audio_nmf_snr_filter — data_juicer 1.0.0 documentation
+ data_juicer.ops.filter.audio_nmf_snr_filter — data_juicer 1.0.1 documentation
-
-
-
+
+
+
@@ -136,14 +136,18 @@ Source code for data_juicer.ops.filter.audio_nmf_snr_filter
return snr
-[docs]@OPERATORS.register_module(OP_NAME)
+
+[docs]
+@OPERATORS.register_module(OP_NAME)
@LOADED_AUDIOS.register_module(OP_NAME)
class AudioNMFSNRFilter(Filter):
"""Keep data samples whose audios' SNRs (computed based on NMF) are within
a specified range.
"""
-[docs] def __init__(self,
+
+[docs]
+ def __init__(self,
min_snr: float = 0,
max_snr: float = sys.maxsize,
nmf_iter_num: PositiveInt = 500,
@@ -175,7 +179,10 @@ Source code for data_juicer.ops.filter.audio_nmf_snr_filter
f'Can only be one of ["any", "all"].')
self.any = (any_or_all == 'any')
-[docs] def compute_stats_single(self, sample, context=False):
+
+
+[docs]
+ def compute_stats_single(self, sample, context=False):
# check if it's computed already
if StatsKeys.audio_nmf_snr in sample[Fields.stats]:
return sample
@@ -203,7 +210,10 @@ Source code for data_juicer.ops.filter.audio_nmf_snr_filter
return sample
-[docs] def process_single(self, sample):
+
+
+[docs]
+ def process_single(self, sample):
audio_snrs = sample[Fields.stats][StatsKeys.audio_nmf_snr]
keep_bools = np.array(
[self.min_snr <= snr <= self.max_snr for snr in audio_snrs])
@@ -214,7 +224,9 @@ Source code for data_juicer.ops.filter.audio_nmf_snr_filter
if self.any:
return keep_bools.any()
else:
- return keep_bools.all()
+ return keep_bools.all()
+
+
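The NMF-SNR filter has the same shape; nmf_iter_num bounds the NMF optimisation used to separate signal from noise before the SNR is computed. A sketch under the same assumptions as the duration filter:

from data_juicer.ops.filter.audio_nmf_snr_filter import AudioNMFSNRFilter
from data_juicer.utils.constant import Fields

op = AudioNMFSNRFilter(min_snr=3.0, nmf_iter_num=500, any_or_all='all')
sample = {'audios': ['audio/speech.wav'], Fields.stats: {}}
sample = op.compute_stats_single(sample)   # NMF-based SNR per audio
keep = op.process_single(sample)           # every SNR must reach 3.0 here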
diff --git a/_modules/data_juicer/ops/filter/audio_size_filter.html b/_modules/data_juicer/ops/filter/audio_size_filter.html
index 145310f0f..1aef7b8fd 100644
--- a/_modules/data_juicer/ops/filter/audio_size_filter.html
+++ b/_modules/data_juicer/ops/filter/audio_size_filter.html
@@ -1,18 +1,18 @@
-
+
- data_juicer.ops.filter.audio_size_filter — data_juicer 1.0.0 documentation
+ data_juicer.ops.filter.audio_size_filter — data_juicer 1.0.1 documentation
-
-
-
+
+
+
@@ -85,13 +85,17 @@ Source code for data_juicer.ops.filter.audio_size_filter
from ..base_op import OPERATORS, Filter
-[docs]@OPERATORS.register_module('audio_size_filter')
+
+[docs]
+@OPERATORS.register_module('audio_size_filter')
class AudioSizeFilter(Filter):
"""Keep data samples whose audio size (in bytes/kb/MB/...) within a
specific range.
"""
-[docs] def __init__(self,
+
+[docs]
+ def __init__(self,
min_size: str = '0',
max_size: str = '1TB',
any_or_all: str = 'any',
@@ -119,7 +123,10 @@ Source code for data_juicer.ops.filter.audio_size_filter
f'Can only be one of ["any", "all"].')
self.any = (any_or_all == 'any')
-[docs] def compute_stats_single(self, sample, context=False):
+
+
+[docs]
+ def compute_stats_single(self, sample, context=False):
# check if it's computed already
if StatsKeys.audio_sizes in sample[Fields.stats]:
return sample
@@ -137,7 +144,10 @@ Source code for data_juicer.ops.filter.audio_size_filter
return sample
-