From d9142eb0231725519cb7056c1c0ec4aab82acd9d Mon Sep 17 00:00:00 2001 From: yxdyc Date: Thu, 28 Nov 2024 02:15:32 +0000 Subject: [PATCH] deploy: 67663166b409091d585756146e08274c71a5a059 --- data_juicer.core.html | 8 +------- searchindex.js | 2 +- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/data_juicer.core.html b/data_juicer.core.html index a8f8b7f2e..6efc54d0e 100644 --- a/data_juicer.core.html +++ b/data_juicer.core.html @@ -302,14 +302,8 @@
Parameters:
    -
  • dataset_path (str) – Path (e.g. “dataset/train”) or remote URI (e.g. “s3//my-bucket/dataset/train”) +

  • dataset_path (path-like) – Path (e.g. “dataset/train”) or remote URI (e.g. “s3//my-bucket/dataset/train”) of the dataset directory where the dataset will be loaded from.

  • -
  • fs (fsspec.spec.AbstractFileSystem, optional) –

    Instance of the remote filesystem where the dataset will be saved to.

    -

    <Deprecated version=”2.8.0”>

    -

    fs was deprecated in version 2.8.0 and will be removed in 3.0.0. -Please use storage_options instead, e.g. storage_options=fs.storage_options

    -

    </Deprecated>

    -

  • keep_in_memory (bool, defaults to None) – Whether to copy the dataset in-memory. If None, the dataset will not be copied in-memory unless explicitly enabled by setting datasets.config.IN_MEMORY_MAX_SIZE to nonzero. See more details in the diff --git a/searchindex.js b/searchindex.js index a4ef2296f..bf6203507 100644 --- a/searchindex.js +++ b/searchindex.js @@ -1 +1 @@ -Search.setIndex({"docnames": ["data_juicer", "data_juicer.analysis", "data_juicer.config", "data_juicer.core", "data_juicer.format", "data_juicer.ops", "data_juicer.ops.common", "data_juicer.ops.deduplicator", "data_juicer.ops.filter", "data_juicer.ops.mapper", "data_juicer.ops.selector", "data_juicer.tools", "data_juicer.utils", "index", "modules"], "filenames": ["data_juicer.rst", "data_juicer.analysis.rst", "data_juicer.config.rst", "data_juicer.core.rst", "data_juicer.format.rst", "data_juicer.ops.rst", "data_juicer.ops.common.rst", "data_juicer.ops.deduplicator.rst", "data_juicer.ops.filter.rst", "data_juicer.ops.mapper.rst", "data_juicer.ops.selector.rst", "data_juicer.tools.rst", "data_juicer.utils.rst", "index.rst", "modules.rst"], "titles": ["data_juicer", "data_juicer.analysis", "data_juicer.config", "data_juicer.core", "data_juicer.format", "data_juicer.ops", "data_juicer.ops.common", "data_juicer.ops.deduplicator", "data_juicer.ops.filter", "data_juicer.ops.mapper", "data_juicer.ops.selector", "data_juicer.tools", "data_juicer.utils", "Welcome to data-juicer\u2019s documentation!", "data_juicer"], "terms": {"cuda_device_count": [0, 14], "sourc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "is_cuda_avail": [0, 14], "class": [1, 3, 4, 5, 7, 8, 9, 10], "columnwiseanalysi": [1, 3, 13], "dataset": [1, 3, 4, 5, 7, 8, 9, 10], "output_path": 1, "overall_result": 1, "none": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "save_stats_in_one_fil": 1, "true": [1, 2, 3, 5, 6, 7, 8, 9, 10], "base": [1, 3, 4, 5, 7, 8, 9, 10], "object": [1, 2, 3, 8], "appli": [1, 3, 7, 9, 10], "each": [1, 3, 5, 7, 9], "column": [1, 3, 9], "stat": [1, 3, 5, 7, 8], "respect": [1, 9], "__init__": [1, 3, 4, 5, 7, 8, 9, 10], "initi": [1, 2, 3, 4, 7, 8, 9, 10], "method": [1, 3, 4, 6, 7, 8, 9, 10], "paramet": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "analyz": [1, 2, 3, 13], "path": [1, 2, 3, 4, 7, 8, 9], "store": [1, 3, 4, 5, 7, 8, 9], "result": [1, 3, 8], "option": [1, 3, 4, 9], "precomput": 1, "overal": 1, "whether": [1, 2, 3, 4, 5, 6, 7, 8, 9], "save": [1, 2, 3], "all": [1, 3, 6, 8, 9], "figur": [1, 3, 9], "one": [1, 2, 6, 7, 8, 9], "imag": [1, 5, 7, 8, 9], "file": [1, 2, 3, 4, 5, 8, 9], "show_percentil": 1, "fals": [1, 2, 3, 4, 5, 6, 7, 8, 9], "show": [1, 3, 9], "skip_export": [1, 3], "draw": 1, "percentil": [1, 10], "line": [1, 2, 8, 9], "sub": [1, 6, 7], "If": [1, 3, 7, 8, 9], "": [1, 3, 7, 8, 9], "sever": [1, 3, 9], "red": 1, "indic": [1, 9], "quantil": 1, "distribut": [1, 3, 9], "singl": [1, 3, 9], "window": [1, 7], "after": [1, 3, 6, 7, 8, 9], "disk": [1, 3], "return": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "draw_hist": 1, "ax": 1, "data": [1, 3, 4, 5, 8, 9], "save_path": 1, "histogram": 1, "includ": [1, 3, 7, 8, 9], "inform": [1, 3, 5, 7, 8, 9, 10], "draw_box": 1, "box": [1, 9], "plot": 1, "diversityanalysi": [1, 13], "lang_or_model": 1, "en": [1, 6, 8, 9], "divers": [1, 9], "sampl": [1, 3, 4, 5, 7, 8, 9, 10], "get": [1, 6], "an": [1, 3, 4, 5, 7, 8, 9], "param": [1, 2, 4, 6, 7, 9], "model": [1, 6, 7, 8, 9, 13], "specif": [1, 3, 5, 7, 8, 9], "languag": [1, 7, 8, 9], "us": [1, 2, 3, 4, 5, 6, 7, 8, 9, 13], "load": [1, 3, 4, 5, 9], "comput": [1, 3, 5, 6, 7, 8], "column_nam": 1, "text": [1, 4, 5, 6, 7, 8, 9], "lexic": 1, "tree": [1, 8], "name": [1, 3, 4, 5, 8, 9], "postproc_func": 1, "function": [1, 6, 7], "get_divers": 1, "postproc_kwarg": 1, "whole": [1, 8, 9], "In": [1, 3], "default": [1, 2, 3, 4, 7, 8, 9], "argument": [1, 3, 5, 8, 9], "overallanalysi": [1, 3, 13], "mean": [1, 3, 9], "std": 1, "etc": [1, 3, 4], "refine_single_column": 1, "col": 1, "num_proc": [1, 3, 4], "1": [1, 3, 4, 8, 9], "describ": 1, "panda": 1, "number": [1, 3, 4, 5, 7, 8, 9, 10], "process": [1, 3, 4, 5, 6, 7, 8, 9, 10, 13], "export": [1, 3, 4, 5, 13], "init_config": [2, 13], "arg": [2, 3, 4, 5, 7, 8, 9, 10], "list": [2, 3, 4, 5, 6, 8, 9], "str": [2, 3, 4, 6, 7, 8, 9, 10], "jsonargpars": [2, 3], "parser": 2, "pars": [2, 9], "from": [2, 3, 4, 5, 6, 7, 8, 9, 10], "posix": 2, "style": 2, "command": [2, 4, 9], "yaml": [2, 9], "json": [2, 3, 4, 8], "jsonnet": 2, "superset": 2, "environ": [2, 3], "variabl": 2, "hard": 2, "code": [2, 9], "e": [2, 3, 4, 8, 9], "g": [2, 3, 4, 9], "conifg": 2, "cfg": [2, 3, 4], "defaut": 2, "global": [2, 4, 9], "executor": [2, 3, 13], "get_init_config": [2, 13], "namespac": [2, 3], "dict": [2, 3, 9], "set": [2, 3, 6, 8, 9, 10], "init": 2, "datajuc": 2, "export_config": [2, 13], "format": [2, 3, 8, 9, 13], "skip_non": 2, "bool": [2, 3, 7, 8, 9, 10], "skip_check": 2, "overwrit": [2, 9], "multifil": 2, "some": [2, 9], "ar": [2, 3, 6, 7, 8, 9, 10], "type": [2, 3, 4, 9], "json_ind": 2, "parser_mod": 2, "exclud": 2, "entri": 2, "whose": [2, 8, 9], "valu": [2, 3, 5, 7, 8, 9, 10], "i": [2, 3, 4, 5, 6, 7, 8, 9], "skip": [2, 3], "check": [2, 9], "exist": 2, "multipl": [2, 3, 4, 6, 7, 8], "__path__": 2, "meta": [2, 4], "merge_config": [2, 13], "ori_cfg": 2, "new_cfg": 2, "merg": [2, 4, 6, 8], "configur": [2, 3, 4, 9], "origin": [2, 3, 8, 9], "expect": [2, 3, 9], "cfg_after_merg": 2, "prepare_side_config": [2, 13], "ori_config": 2, "string": [2, 7, 8, 9], "yml": 2, "adapt": [3, 13], "max_batch_s": 3, "10000": 3, "static": 3, "execute_and_prob": 3, "oper": [3, 8], "sample_interv": 3, "0": [3, 4, 5, 7, 8, 9], "5": [3, 7, 8, 9], "input": [3, 5, 7, 8, 9, 10], "probe": 3, "relat": [3, 8, 9], "op": [3, 13], "specifi": [3, 4, 6, 8, 9, 10], "For": [3, 5, 7, 8, 9], "now": [3, 6, 9], "we": [3, 4, 7, 8, 9, 13], "support": [3, 8, 9], "follow": [3, 9], "target": [3, 8, 9, 10], "resourc": 3, "util": [3, 8], "speed": 3, "averag": [3, 8], "The": [3, 4, 5, 8, 9, 10], "item": [3, 5, 9], "take_batch": 3, "config": [3, 5, 9, 13], "split": [3, 6, 9], "batch": [3, 9], "factor": 3, "size": [3, 6, 7, 8, 9], "iter": [3, 8, 9], "adapt_workload": 3, "manag": [3, 9], "schedul": 3, "balanc": 3, "need": [3, 6, 8, 9, 10], "recip": 3, "probe_small_batch": 3, "perform": 3, "small": [3, 8, 9], "pre": [3, 9], "execut": 3, "avail": [3, 8], "current": 3, "estim": 3, "rank": [3, 8, 9, 10], "notic": [3, 9], "should": [3, 7, 8, 9], "run": [3, 5, 8, 9], "cach": [3, 8], "enabl": [3, 9], "A": [3, 5, 7, 9], "length": [3, 4, 8, 9], "batch_size_strategi": 3, "load_analysis_r": 3, "base_b": 3, "util_th": 3, "9": [3, 8, 9], "decid": [3, 5, 7, 8], "accord": [3, 4, 5, 8, 9], "workload": 3, "analysi": [3, 13], "threshold": [3, 7, 8, 9], "guarante": 3, "won": [3, 7], "t": [3, 4, 6, 7], "exce": [3, 8, 9], "onli": [3, 7, 8, 9], "consid": [3, 7, 8, 9], "bucket": 3, "effect": 3, "which": [3, 5, 7, 8, 9], "max": [3, 4, 7, 8, 9], "except": [3, 9], "gpu": 3, "thi": [3, 4, 5, 6, 7, 8, 9, 10], "It": [3, 4, 7, 8, 9], "filter": [3, 5, 7, 9, 13], "gener": [3, 9], "tabl": [3, 9], "help": 3, "user": 3, "understand": 3, "better": [3, 8], "load_data_np": 3, "int": [3, 4, 7, 8, 9, 10], "skip_return": 3, "pipelin": 3, "worker": 3, "when": [3, 4, 5, 7, 8, 9, 10], "api": [3, 9], "call": [3, 9], "nesteddataset": [3, 13], "karg": 3, "djdataset": 3, "enhanc": 3, "huggingfac": [3, 4, 8, 9], "usabl": 3, "effici": 3, "work_dir": 3, "checkpoint": 3, "tracer": [3, 5, 7, 13], "map": [3, 9], "overrid": 3, "func": 3, "most": [3, 9], "common": [3, 13], "can": [3, 8, 9], "access": 3, "nest": 3, "manner": 3, "select": [3, 4, 5, 8, 9, 10], "classmethod": [3, 4], "from_dict": 3, "from_xx": 3, "constructor": 3, "construct": [3, 9], "add_column": 3, "add": [3, 4, 9], "select_column": 3, "remove_column": 3, "remov": [3, 5, 6, 8, 9], "cleanup_cache_fil": 3, "clear": 3, "raw": [3, 9], "compress": 3, "load_from_disk": 3, "wa": [3, 9], "previous": 3, "save_to_disk": 3, "directori": [3, 4, 8], "filesystem": 3, "ani": [3, 6, 8, 9], "implement": [3, 7], "fsspec": 3, "spec": 3, "abstractfilesystem": 3, "dataset_path": [3, 4], "train": [3, 9], "remot": [3, 9], "uri": 3, "s3": 3, "my": 3, "where": 3, "f": [3, 4], "instanc": [3, 5], "deprec": 3, "version": [3, 9], "2": [3, 6, 8, 9], "8": [3, 8, 9], "3": [3, 8, 9], "pleas": [3, 7, 9], "storage_opt": 3, "instead": [3, 4, 6], "keep_in_memori": 3, "copi": 3, "memori": 3, "unless": 3, "explicitli": 3, "in_memory_max_s": 3, "nonzero": 3, "see": [3, 13], "more": [3, 8, 9, 13], "detail": [3, 8, 9, 13], "improv": 3, "section": 3, "kei": [3, 4, 5, 8, 9, 10], "pair": [3, 5, 7, 8, 9], "pass": [3, 9], "system": [3, 9], "backend": 3, "ad": [3, 6, 9], "request": [3, 9], "datasetdict": 3, "exampl": [3, 8, 9], "py": [3, 4], "d": [3, 4, 9], "unifi": [3, 4], "order": [3, 9, 10], "sample_data": 3, "dataset_to_sampl": 3, "sample_ratio": 3, "float": [3, 7, 8, 9, 10], "sample_algo": 3, "uniform": [3, 8, 9], "kwarg": [3, 4, 5, 7, 8, 9, 10], "subset": [3, 4], "given": [3, 8, 9], "formatt": [3, 4], "link": [3, 9], "ratio": [3, 4, 6, 8, 9, 10], "algorithm": [3, 7, 9], "frequency_specified_field_selector": 3, "topk_specified_field_selector": 3, "export_path": 3, "export_shard_s": 3, "export_in_parallel": 3, "export_d": 3, "keep_stats_in_res_d": 3, "keep_hashes_in_res_d": 3, "export_stat": 3, "kib": 3, "1024": 3, "mib": 3, "1048576": 3, "gib": 3, "1073741824": 3, "tib": 3, "1099511627776": 3, "shard": 3, "content": [3, 9], "keep": [3, 5, 7, 8, 9], "hash": [3, 5, 7], "export_compute_stat": 3, "statu": 3, "to_jsonl": 3, "jsonl": [3, 4], "extra": [3, 4, 7, 8, 9, 10], "to_json": 3, "to_parquet": 3, "parquet": [3, 4], "monitor": [3, 13], "other": [3, 8, 9], "dure": 3, "python": 3, "time": [3, 9], "10": [3, 8, 9], "interv": 3, "timestamp": 3, "xxx": 3, "cpu": 3, "count": [3, 8], "free": 3, "mem": 3, "structur": 3, "abov": [3, 9], "field": [3, 4, 5, 7, 8, 9, 10], "first": [3, 6, 7, 8, 9], "level": [3, 5, 6, 7, 8, 9, 10], "resource_analysi": 3, "min": [3, 7, 8, 9], "avg": [3, 8], "those": [3, 8, 9], "dynamic_field": 3, "monitor_all_resourc": 3, "detect": [3, 7, 8, 9], "node": 3, "monitor_current_resourc": 3, "machin": 3, "rang": [3, 8, 9, 10], "mb": [3, 8], "draw_resource_util_graph": 3, "resource_util_list": 3, "store_dir": 3, "analyze_resource_util_list": 3, "metric": [3, 5, 7, 8], "analyze_single_resource_util": 3, "resource_util_dict": 3, "monitor_func": 3, "show_num": [3, 5, 7], "trace": [3, 5, 7], "chang": [3, 9], "befor": [3, 8], "comparison": 3, "work": [3, 8, 9], "maximum": [3, 8, 9], "trace_mapp": 3, "op_nam": 3, "previous_d": 3, "processed_d": 3, "text_kei": [3, 4, 5], "compar": 3, "mapper": [3, 5, 13], "mainli": 3, "differ": [3, 4, 6, 7, 8, 9], "due": 3, "modif": 3, "trace_batch_mapp": 3, "batchmapp": 3, "new": [3, 4, 9], "augment": [3, 6, 8, 9], "trace_filt": 3, "trace_dedupl": 3, "dup_pair": 3, "dedupl": [3, 5, 9, 13], "duplic": [3, 5, 7], "extract": [3, 8, 9], "two": [3, 7, 8, 9], "embed": 3, "independ": [3, 8, 9], "obtain": [3, 6], "load_formatt": [4, 13], "generated_dataset_config": [4, 9], "suffix": [4, 8], "add_suffix": 4, "baseformatt": 4, "mixtur": 4, "weight": [4, 7, 9], "creat": 4, "provid": [4, 7, 9], "must": [4, 8, 9], "contain": [4, 6, 8, 9], "info": [4, 5], "jsonformatt": [4, 13], "localformatt": [4, 13], "zst": 4, "local": 4, "packag": 4, "modul": [4, 13], "csv": 4, "load_dataset": 4, "global_cfg": 4, "its": [4, 5, 7, 9], "consequ": 4, "remoteformatt": [4, 13], "repositori": 4, "hub": 4, "textformatt": [4, 13], "txt": [4, 8], "pdf": [4, 8], "cpp": 4, "docx": [4, 8], "md": 4, "tex": [4, 9], "asm": 4, "bat": 4, "cmd": 4, "c": 4, "h": [4, 8, 9], "hpp": 4, "cc": 4, "hh": 4, "cmake": 4, "css": 4, "dockerfil": 4, "f90": 4, "f03": 4, "f08": 4, "f77": 4, "f95": 4, "fpp": 4, "go": 4, "html": [4, 8, 9], "java": 4, "j": 4, "jl": 4, "lua": 4, "markdown": 4, "php": 4, "php3": 4, "php4": 4, "php5": 4, "phpt": 4, "pl": 4, "pm": 4, "pod": 4, "perl": 4, "ps1": 4, "psd1": 4, "psm1": 4, "rb": 4, "r": 4, "sql": 4, "scala": 4, "sh": 4, "bash": 4, "zsh": 4, "tsx": 4, "vb": 4, "makefil": 4, "xml": [4, 8, 9], "rst": 4, "m": [4, 9], "smali": 4, "datas": 4, "unified_format_dataset": 4, "parquetformatt": [4, 13], "csvformatt": [4, 13], "tsvformatt": [4, 13], "tsv": 4, "delimit": [4, 9], "mixtureformatt": [4, 13], "max_sampl": 4, "mix": [4, 9], "randomli": [4, 9], "everi": 4, "them": [4, 7, 8, 9], "datasset": 4, "dir": 4, "w1": 4, "w2": 4, "ds_dir": 4, "w3": 4, "ds_file": 4, "random_sampl": 4, "sample_numb": 4, "seed": [4, 9], "bigger": [4, 9], "than": [4, 6, 7, 8, 9, 10], "random": [4, 9, 10], "42": 4, "emptyformatt": [4, 9, 13], "feature_kei": [4, 9], "empti": [4, 7, 9], "featur": 4, "properti": 4, "null_valu": 4, "rayemptyformatt": [4, 9, 13], "rai": [4, 7, 9], "load_op": [5, 13], "process_list": 5, "image_kei": 5, "audio_kei": 5, "audio": [5, 8, 9], "video_kei": [5, 9], "video": [5, 7, 8, 9], "compute_stats_batch": [5, 8], "process_batch": [5, 8, 9], "compute_stats_singl": [5, 7, 8], "context": [5, 7, 8, 9], "intermedi": [5, 7, 8], "var": [5, 7, 8], "temporarili": [5, 7, 8], "process_singl": [5, 7, 8, 9], "boolean": [5, 7, 8], "reduc": [5, 8, 9], "conduct": 5, "edit": 5, "compute_hash": [5, 7], "doc": [5, 7], "open": [5, 7, 9], "selector": [5, 13], "get_sentences_from_docu": [6, 13], "document": [6, 7, 8, 9], "model_func": 6, "sentenc": [6, 9], "splite": 6, "separ": [6, 8, 9, 10], "n": [6, 8, 9], "get_words_from_docu": [6, 13], "token_func": 6, "new_lin": 6, "tab": 6, "word": [6, 8, 9], "like": [6, 7, 8, 9], "stopword": [6, 8], "token": [6, 7, 8, 9], "merge_on_whitespace_tab_newlin": [6, 13], "invert": 6, "split_on_newline_tab_whitespac": [6, 13], "concaten": [6, 9], "split_on_whitespac": [6, 13], "also": 6, "space": [6, 7], "tag": [6, 8, 9], "strip": [6, 13], "strip_charact": 6, "wai": [6, 9], "faster": 6, "sinc": 6, "lot": 6, "element": 6, "emoji": 6, "charact": [6, 7, 8, 9], "words_augment": [6, 13], "group_siz": 6, "join_char": 6, "especi": [6, 8], "chines": [6, 7, 8, 9], "without": [6, 9], "between": [6, 7, 8, 9], "vietnames": [6, 8], "syllabl": 6, "group": [6, 8], "words_refin": [6, 13], "lower_cas": 6, "strip_char": 6, "use_words_aug": [6, 8], "words_aug_group_s": [6, 8], "words_aug_join_char": [6, 8], "refin": 6, "non": [6, 7, 9], "revers": [6, 10], "special": [6, 8, 9], "convert": [6, 7, 9], "lower": [6, 7, 8, 9, 10], "case": [6, 7, 8, 9, 13], "lowercas": [6, 7, 9], "char": [6, 8, 9], "split_text_by_punctu": [6, 13], "zh": [6, 8], "punctuat": [6, 7, 9], "documentdedupl": [7, 13], "ignore_non_charact": 7, "exact": 7, "match": [7, 8, 9], "md5": 7, "ignor": [7, 9], "alphabet": [7, 8, 9], "whitespac": [7, 9], "digit": 7, "documentminhashdedupl": [7, 13], "window_s": 7, "ignore_pattern": 7, "num_permut": 7, "256": 7, "jaccard_threshold": 7, "7": [7, 9], "num_band": 7, "num_rows_per_band": 7, "tokenizer_model": 7, "minhashlsh": 7, "simhash": 7, "minhash": 7, "byte": [7, 8], "so": [7, 8, 9], "thei": [7, 9], "kept": [7, 8, 9], "final": [7, 9], "sentencepiec": 7, "english": [7, 8, 9], "recommend": [7, 9], "shingl": 7, "pattern": [7, 9], "permut": 7, "jaccard": 7, "similar": [7, 8, 9], "regard": [7, 9], "band": 7, "lsh": 7, "determin": [7, 9, 10], "optim": [7, 9], "minim": 7, "sum": 7, "prob": 7, "posit": [7, 8, 9], "neg": [7, 9], "row": 7, "documentsimhashdedupl": [7, 13], "6": [7, 8, 9], "num_block": 7, "hamming_dist": 7, "4": [7, 8, 9], "And": [7, 9], "block": 7, "ham": 7, "distanc": 7, "alwai": 7, "less": [7, 8, 9, 10], "imagededupl": [7, 13], "phash": 7, "consider_text": 7, "togeth": [7, 9], "raybasicdedupl": [7, 13], "redis_host": 7, "localhost": 7, "redis_port": 7, "6380": 7, "basic": 7, "although": 7, "empty_hash_valu": 7, "hostnam": 7, "redi": 7, "server": 7, "port": 7, "calculate_hash": 7, "calcul": [7, 8, 9], "raydocumentdedupl": [7, 13], "rayimagededupl": [7, 13], "rayvideodedupl": [7, 13], "videodedupl": [7, 13], "alphanumericfilt": [8, 13], "min_ratio": [8, 9], "25": 8, "max_ratio": [8, 9], "9223372036854775807": [8, 9], "numer": [8, 9], "within": [8, 9, 10], "alphanumer": 8, "total": [8, 9], "below": [8, 9], "audiodurationfilt": [8, 13], "min_dur": 8, "max_dur": 8, "any_or_al": [8, 9], "durat": [8, 9], "second": [8, 9], "sy": 8, "maxsiz": 8, "strategi": [8, 9], "meet": [8, 9], "condit": [8, 9], "audionmfsnrfilt": [8, 13], "min_snr": 8, "max_snr": 8, "nmf_iter_num": 8, "500": [8, 9], "snr": 8, "nmf": 8, "db": 8, "audiosizefilt": [8, 13], "min_siz": 8, "max_siz": 8, "1tb": 8, "kb": 8, "constraint": 8, "approxim": 8, "un": 8, "limit": 8, "averagelinelengthfilt": [8, 13], "min_len": [8, 9], "max_len": [8, 9], "characterrepetitionfilt": [8, 13], "rep_len": 8, "gram": 8, "repetit": 8, "flaggedwordfilt": [8, 13], "lang": [8, 9], "045": 8, "flagged_words_dir": 8, "home": 8, "runner": 8, "asset": 8, "flag": 8, "what": [8, 9], "adopt": 8, "flagged_word": 8, "join": 8, "imageaestheticsfilt": [8, 13], "hf_scorer_model": 8, "trust_remote_cod": [8, 9], "min_scor": 8, "max_scor": 8, "aesthet": 8, "score": [8, 9], "predictor": 8, "By": [8, 9], "shunk031": 8, "v2": 8, "sac": 8, "logo": 8, "ava1": 8, "l14": 8, "linearms": 8, "refer": [8, 9], "pypi": 8, "org": [8, 9], "project": 8, "simpl": [8, 9], "predict": 8, "keyword": [8, 9], "imageaspectratiofilt": [8, 13], "333": 8, "aspect": [8, 9], "aspectratio": [8, 9], "w": [8, 9], "imagefacecountfilt": [8, 13], "cv_classifi": [8, 9], "min_face_count": 8, "max_face_count": 8, "face": [8, 9], "opencv": [8, 9], "classifi": [8, 9], "haarcascade_frontalface_alt": [8, 9], "minimum": [8, 9], "requir": 8, "imagefaceratiofilt": [8, 13], "area": 8, "largest": [8, 10], "imagensfwfilt": [8, 13], "hf_nsfw_model": 8, "falconsai": 8, "nsfw_image_detect": 8, "score_threshold": 8, "have": [8, 9], "low": 8, "nsfw": 8, "imagepairsimilarityfilt": [8, 13], "hf_clip": 8, "openai": 8, "clip": [8, 9], "vit": 8, "patch32": 8, "closedunitinterv": 8, "imageshapefilt": [8, 13], "min_width": [8, 9], "max_width": [8, 9], "min_height": [8, 9], "max_height": [8, 9], "shape": 8, "width": [8, 9], "height": [8, 9], "imagesizefilt": [8, 13], "imagetextmatchingfilt": [8, 13], "hf_blip": 8, "salesforc": [8, 9], "blip": [8, 9], "itm": 8, "coco": 8, "003": 8, "horizontal_flip": [8, 9], "vertical_flip": [8, 9], "reduce_mod": 8, "flip": [8, 9], "horizont": [8, 9], "left": [8, 9], "right": [8, 9], "vertic": [8, 9], "top": [8, 9, 10], "bottom": [8, 9], "mode": [8, 9], "correspond": [8, 9, 10], "chunk": [8, 9], "take": 8, "imagetextsimilarityfilt": [8, 13], "imagewatermarkfilt": [8, 13], "hf_watermark_model": 8, "amrul": 8, "hzz": 8, "watermark_detector": 8, "prob_threshold": 8, "watermark": [8, 9], "high": [8, 9], "probabl": [8, 9], "languageidscorefilt": [8, 13], "confid": 8, "larger": [8, 9, 10], "identif": 8, "maximumlinelengthfilt": [8, 13], "perplexityfilt": [8, 13], "max_ppl": 8, "1500": 8, "perplex": 8, "phrasegroundingrecallfilt": [8, 13], "hf_owlvit": 8, "googl": 8, "owlvit": 8, "min_recal": 8, "max_recal": 8, "iou_thr": 8, "large_area_ratio_thr": 8, "95": [8, 9], "conf_thr": 8, "locat": [8, 9], "recal": 8, "phrase": 8, "owl": 8, "ground": 8, "iou": 8, "nm": 8, "post": 8, "bbox": 8, "overlap": [8, 9], "out": 8, "larg": 8, "account": 8, "specialcharactersfilt": [8, 13], "specifiedfieldfilt": [8, 13], "field_kei": [8, 10], "target_valu": 8, "multi": [8, 9, 10, 13], "retain": [8, 9], "specifiednumericfieldfilt": [8, 13], "min_valu": 8, "max_valu": 8, "specifiednumericfield": 8, "stopwordsfilt": [8, 13], "stopwords_dir": 8, "suffixfilt": [8, 13], "textactionfilt": [8, 13], "min_action_num": 8, "action": [8, 9], "mini_action_num": 8, "textentitydependencyfilt": [8, 13], "min_dependency_num": 8, "identifi": [8, 9], "entiti": [8, 9], "omit": 8, "mini_dependency_num": 8, "edg": [8, 9], "depend": [8, 9], "objet": 8, "textlengthfilt": [8, 13], "tokennumfilt": [8, 13], "hf_token": 8, "eleutherai": 8, "pythia": 8, "9b": 8, "dedup": 8, "min_num": 8, "max_num": 8, "hug": [8, 9], "videoaestheticsfilt": [8, 13], "frame_sampling_method": [8, 9], "frame_num": [8, 9], "frame": [8, 9], "all_keyfram": [8, 9], "former": [8, 9], "latter": [8, 9], "uniformli": [8, 9], "keyfram": 8, "while": 8, "usual": 8, "term": 8, "middl": [8, 9], "last": [8, 9], "addit": [8, 9], "videoaspectratiofilt": [8, 13], "21": [8, 9], "videodurationfilt": [8, 13], "videoframestextsimilarityfilt": [8, 13], "kind": [8, 9], "chineseclip": 8, "might": [8, 9], "choic": [8, 9], "videomotionscorefilt": [8, 13], "7976931348623157e": 8, "308": 8, "sampling_fp": 8, "tupl": 8, "divis": [8, 9], "rel": 8, "motion": 8, "farneback": 8, "algorith": 8, "dens": 8, "optic": 8, "flow": 8, "rate": 8, "frames_per_second": 8, "resiz": [8, 9], "sequenc": [8, 9], "smaller": [8, 9, 10], "rescal": 8, "allow": [8, 9], "longer": 8, "greater": [8, 9, 10], "being": [8, 9], "overrul": 8, "equal": [8, 9, 10], "As": 8, "mai": [8, 9], "shorter": [8, 9], "dimens": [8, 9], "magnitud": 8, "normal": [8, 9], "diagon": 8, "setup_model": 8, "compute_flow": 8, "prev_fram": 8, "curr_fram": 8, "videomotionscoreraftfilt": [8, 13], "raft": 8, "recurr": 8, "transform": [8, 9], "torchvis": 8, "further": 8, "offici": 8, "http": [8, 9], "pytorch": 8, "vision": [8, 9], "main": [8, 9], "paper": 8, "here": [8, 9, 13], "arxiv": 8, "ab": 8, "2003": 8, "12039": 8, "videonsfwfilt": [8, 13], "videoocrarearatiofilt": [8, 13], "min_area_ratio": 8, "max_area_ratio": 8, "frame_sample_num": 8, "languages_to_detect": 8, "ch_sim": 8, "ocr": [8, 9], "evenli": 8, "full": [8, 9], "found": [8, 9], "www": 8, "jaid": 8, "ai": [8, 9], "easyocr": 8, "get_read": 8, "videoresolutionfilt": [8, 13], "resolut": [8, 9], "videotaggingfromframesfilt": [8, 13], "peopl": 8, "tag_field_nam": [8, 9], "__dj__video_frame_tags__": [8, 9], "shift": [8, 9], "github": 8, "com": 8, "xinyu1205": 8, "recogn": 8, "anyth": 8, "blob": 8, "ram": 8, "ram_tag_list": 8, "noqa": 8, "e501": 8, "videowatermarkfilt": [8, 13], "wordrepetitionfilt": [8, 13], "wordsnumfilt": [8, 13], "audioffmpegwrappedmapp": [9, 13], "filter_nam": 9, "filter_kwarg": 9, "global_arg": 9, "capture_stderr": 9, "overwrite_output": 9, "wrapper": 9, "ffmpeg": 9, "captur": 9, "stderr": 9, "output": 9, "calibrateqamapp": [9, 13], "api_model": 9, "gpt": 9, "4o": 9, "api_endpoint": 9, "response_path": 9, "system_prompt": 9, "input_templ": 9, "reference_templ": 9, "qa_pair_templ": 9, "output_pattern": 9, "try_num": 9, "model_param": 9, "sampling_param": 9, "calibr": 9, "question": 9, "answer": 9, "default_system_prompt": 9, "\u8bf7\u6839\u636e\u63d0\u4f9b\u7684": 9, "\u53c2\u8003\u4fe1\u606f": 9, "\u5bf9": 9, "\u95ee\u9898": 9, "\u548c": 9, "\u56de\u7b54": 9, "\u8fdb\u884c\u6821\u51c6": 9, "\u4f7f\u5176\u66f4\u52a0\u8be6\u7ec6": 9, "\u51c6\u786e": 9, "n\u6309\u7167\u4ee5\u4e0b\u683c\u5f0f\u8f93\u51fa": 9, "n\u6821\u51c6\u540e\u7684\u95ee\u9898": 9, "n\u6821\u51c6\u540e\u7684\u56de\u7b54": 9, "default_input_templ": 9, "qa_pair": 9, "default_reference_templ": 9, "default_qa_pair_templ": 9, "default_output_pattern": 9, "url": 9, "endpoint": 9, "respons": 9, "messag": 9, "prompt": 9, "task": 9, "templat": 9, "build": 9, "regular": 9, "express": 9, "temperatur": 9, "top_p": 9, "build_input": 9, "parse_output": 9, "raw_output": 9, "calibratequerymapp": [9, 13], "queri": 9, "\u5bf9\u95ee\u7b54\u5bf9\u4e2d\u7684": 9, "\u4e14\u4ecd\u53ef\u4ee5\u7531\u539f\u7b54\u6848\u56de\u7b54": 9, "\u53ea\u8f93\u51fa\u6821\u51c6\u540e\u7684\u95ee\u9898": 9, "\u4e0d\u8981\u8f93\u51fa\u591a\u4f59\u5185\u5bb9": 9, "calibrateresponsemapp": [9, 13], "\u4e14\u4ecd\u53ef\u4ee5\u56de\u7b54\u539f\u95ee\u9898": 9, "\u53ea\u8f93\u51fa\u6821\u51c6\u540e\u7684\u56de\u7b54": 9, "chineseconvertmapp": [9, 13], "s2t": 9, "tradit": 9, "simplifi": 9, "japanes": 9, "kanji": 9, "choos": 9, "t2": 9, "s2tw": 9, "taiwan": 9, "standard": 9, "tw2": 9, "s2hk": 9, "hong": 9, "kong": 9, "variant": 9, "hk2": 9, "s2twp": 9, "taiwanes": 9, "idiom": 9, "tw2sp": 9, "mainland": 9, "t2tw": 9, "tw2t": 9, "hk2t": 9, "t2hk": 9, "t2jp": 9, "ky\u016bjitai": 9, "jp2t": 9, "shinjitai": 9, "cleancopyrightmapp": [9, 13], "clean": 9, "copyright": 9, "comment": 9, "begin": 9, "cleanemailmapp": [9, 13], "repl": 9, "email": 9, "search": [9, 13], "replac": 9, "cleanhtmlmapp": [9, 13], "cleanipmapp": [9, 13], "ipv4": 9, "ipv6": 9, "address": 9, "cleanlinksmapp": [9, 13], "ftp": 9, "expandmacromapp": [9, 13], "expand": 9, "macro": 9, "definit": 9, "bodi": 9, "latex": 9, "extractentityattributemapp": [9, 13], "query_ent": 9, "query_attribut": 9, "entity_kei": 9, "__dj__main_entity__": 9, "attribute_kei": 9, "__dj__attribute__": 9, "attribute_desc_kei": 9, "__dj__attribute_description__": 9, "support_text_kei": 9, "__dj__attribute_support_text__": 9, "system_prompt_templ": 9, "attr_pattern_templ": 9, "demo_pattern": 9, "drop_text": 9, "attribut": 9, "default_system_prompt_templ": 9, "\u7ed9\u5b9a\u4e00\u6bb5\u6587\u672c": 9, "\u4ece\u6587\u672c\u4e2d\u603b\u7ed3": 9, "\u7684": 9, "\u5e76\u4e14\u4ece\u539f\u6587\u6458\u5f55\u6700\u80fd\u8bf4\u660e\u8be5": 9, "\u7684\u4ee3\u8868\u6027\u793a\u4f8b": 9, "n\u8981\u6c42": 9, "\u6458\u5f55\u7684\u793a\u4f8b\u5e94\u8be5\u7b80\u77ed": 9, "\u9075\u5faa\u5982\u4e0b\u7684\u56de\u590d\u683c\u5f0f": 9, "\u63cf\u8ff0": 9, "\u4ee3\u8868\u6027\u793a\u4f8b1": 9, "n\u8bf4\u660e": 9, "\u8be5": 9, "\u7684\u539f\u6587\u6458\u5f551": 9, "\u4ee3\u8868\u6027\u793a\u4f8b2": 9, "\u7684\u539f\u6587\u6458\u5f552": 9, "\u6587\u672c": 9, "default_attr_pattern_templ": 9, "z": 9, "default_demon_pattern": 9, "\u4ee3\u8868\u6027\u793a\u4f8b": 9, "__dj__entity__": 9, "entity_attribute_kei": 9, "descript": 9, "__dj__support_text__": 9, "retri": 9, "attempt": 9, "error": 9, "drop": 9, "demonstract": 9, "attribute_nam": 9, "extractentityrelationmapp": [9, 13], "entity_typ": 9, "relation_kei": 9, "__dj__relation__": 9, "prompt_templ": 9, "tuple_delimit": 9, "record_delimit": 9, "completion_delimit": 9, "max_glean": 9, "continue_prompt": 9, "if_loop_prompt": 9, "entity_pattern": 9, "relation_pattern": 9, "knowledg": 9, "graph": 9, "default_prompt_templ": 9, "goal": 9, "ngiven": 9, "potenti": 9, "relev": 9, "activ": 9, "relationship": 9, "among": 9, "step": 9, "n1": 9, "entity_nam": 9, "One": 9, "entity_descript": 9, "comprehens": 9, "nformat": 9, "n2": 9, "source_ent": 9, "target_ent": 9, "clearli": 9, "nfor": 9, "relationship_descript": 9, "explan": 9, "why": 9, "you": 9, "think": 9, "relationship_strength": 9, "strength": 9, "relationship_keyword": 9, "summar": 9, "overarch": 9, "natur": 9, "focus": 9, "concept": 9, "theme": 9, "rather": 9, "n3": 9, "n4": 9, "finish": 9, "nexampl": 9, "nentity_typ": 9, "person": 9, "technologi": 9, "mission": 9, "organ": 9, "ntext": 9, "nwhile": 9, "alex": 9, "clench": 9, "hi": 9, "jaw": 9, "buzz": 9, "frustrat": 9, "dull": 9, "against": 9, "backdrop": 9, "taylor": 9, "authoritarian": 9, "certainti": 9, "competit": 9, "undercurr": 9, "him": 9, "alert": 9, "sens": 9, "jordan": 9, "share": 9, "commit": 9, "discoveri": 9, "unspoken": 9, "rebellion": 9, "cruz": 9, "narrow": 9, "control": 9, "nthen": 9, "did": 9, "someth": 9, "unexpect": 9, "paus": 9, "besid": 9, "moment": 9, "observ": 9, "devic": 9, "akin": 9, "rever": 9, "tech": 9, "understood": 9, "said": 9, "voic": 9, "quieter": 9, "could": 9, "game": 9, "u": 9, "nthe": 9, "underli": 9, "dismiss": 9, "earlier": 9, "seem": 9, "falter": 9, "glimps": 9, "reluct": 9, "graviti": 9, "lai": 9, "hand": 9, "look": 9, "up": 9, "fleet": 9, "heartbeat": 9, "ey": 9, "lock": 9, "wordless": 9, "clash": 9, "wills": 9, "soften": 9, "uneasi": 9, "truce": 9, "nit": 9, "bare": 9, "percept": 9, "note": 9, "inward": 9, "nod": 9, "had": 9, "been": 9, "brought": 9, "noutput": 9, "who": 9, "experi": 9, "dynam": 9, "portrai": 9, "toward": 9, "perspect": 9, "ha": 9, "signific": 9, "interact": 9, "associ": 9, "influenc": 9, "central": 9, "stori": 9, "implic": 9, "affect": 9, "attitud": 9, "power": 9, "contrast": 9, "directli": 9, "lead": 9, "mutual": 9, "conflict": 9, "ideolog": 9, "import": 9, "impact": 9, "technolog": 9, "\u4eba\u7269": 9, "\u6280\u672f": 9, "\u4efb\u52a1": 9, "\u7ec4\u7ec7": 9, "\u5730\u70b9": 9, "n\u4ed6\u4eec\u4e0d\u518d\u662f\u5355\u7eaf\u7684\u6267\u884c\u8005": 9, "\u4ed6\u4eec\u5df2\u6210\u4e3a\u67d0\u4e2a\u8d85\u8d8a\u661f\u8fb0\u4e0e\u6761\u7eb9\u7684\u9886\u57df\u7684\u4fe1\u606f\u5b88\u62a4\u8005": 9, "\u8fd9\u4e00\u4f7f\u547d\u7684\u63d0\u5347\u4e0d\u80fd\u88ab\u89c4\u5219\u548c\u65e2\u5b9a\u534f\u8bae\u6240\u675f\u7f1a": 9, "\u5b83\u9700\u8981\u4e00\u79cd\u65b0\u7684\u89c6\u89d2": 9, "\u4e00\u79cd\u65b0\u7684\u51b3\u5fc3": 9, "n\u968f\u7740\u4e0e\u534e\u76db\u987f\u7684\u901a\u8baf\u5728\u80cc\u666f\u4e2d\u55e1\u55e1\u4f5c\u54cd": 9, "\u5bf9\u8bdd\u4e2d\u7684\u7d27\u5f20\u60c5\u7eea\u901a\u8fc7\u561f\u561f\u58f0\u548c\u9759\u7535\u566a\u97f3\u8d2f\u7a7f\u59cb\u7ec8": 9, "\u56e2\u961f\u7ad9\u7acb\u7740": 9, "\u4e00\u80a1\u4e0d\u7965\u7684\u6c14\u606f\u7b3c\u7f69\u7740\u4ed6\u4eec": 9, "\u663e\u7136": 9, "\u4ed6\u4eec\u5728\u63a5\u4e0b\u6765\u51e0\u4e2a\u5c0f\u65f6\u5185\u505a\u51fa\u7684\u51b3\u5b9a\u53ef\u80fd\u4f1a\u91cd\u65b0\u5b9a\u4e49\u4eba\u7c7b\u5728\u5b87\u5b99\u4e2d\u7684\u4f4d\u7f6e": 9, "\u6216\u8005\u5c06\u4ed6\u4eec\u7f6e\u4e8e\u65e0\u77e5\u548c\u6f5c\u5728\u5371\u9669\u4e4b\u4e2d": 9, "n\u968f\u7740\u4e0e\u661f\u8fb0\u7684\u8054\u7cfb\u53d8\u5f97\u66f4\u52a0\u7262\u56fa": 9, "\u5c0f\u7ec4\u5f00\u59cb\u5904\u7406\u9010\u6e10\u6210\u5f62\u7684\u8b66\u544a": 9, "\u4ece\u88ab\u52a8\u63a5\u53d7\u8005\u8f6c\u53d8\u4e3a\u79ef\u6781\u53c2\u4e0e\u8005": 9, "\u6885\u745f\u540e\u6765\u7684\u76f4\u89c9\u5360\u636e\u4e86\u4e0a\u98ce": 9, "\u56e2\u961f\u7684\u4efb\u52a1\u5df2\u7ecf\u6f14\u53d8": 9, "\u4e0d\u518d\u4ec5\u4ec5\u662f\u89c2\u5bdf\u548c\u62a5\u544a": 9, "\u800c\u662f\u4e92\u52a8\u548c\u51c6\u5907": 9, "\u4e00\u573a\u8715\u53d8\u5df2\u7ecf\u5f00\u59cb": 9, "\u800c": 9, "\u675c\u5c14\u585e\u884c\u52a8": 9, "\u5219\u4ee5\u4ed6\u4eec\u5927\u80c6\u7684\u65b0\u9891\u7387\u9707\u52a8": 9, "\u8fd9\u79cd\u57fa\u8c03\u4e0d\u662f\u7531\u4e16\u4fd7\u8bbe\u5b9a\u7684": 9, "\u534e\u76db\u987f": 9, "\u534e\u76db\u987f\u662f\u6b63\u5728\u63a5\u6536\u901a\u8baf\u7684\u5730\u65b9": 9, "\u8868\u660e\u5176\u5728\u51b3\u7b56\u8fc7\u7a0b\u4e2d\u7684\u91cd\u8981\u6027": 9, "\u675c\u5c14\u585e\u884c\u52a8\u88ab\u63cf\u8ff0\u4e3a\u4e00\u9879\u5df2\u6f14\u53d8\u4e3a\u4e92\u52a8\u548c\u51c6\u5907\u7684\u4efb\u52a1": 9, "\u663e\u793a\u51fa\u76ee\u6807\u548c\u6d3b\u52a8\u7684\u91cd\u5927\u8f6c\u53d8": 9, "\u56e2\u961f": 9, "\u56e2\u961f\u88ab\u63cf\u7ed8\u6210\u4e00\u7fa4\u4ece\u88ab\u52a8\u89c2\u5bdf\u8005\u8f6c\u53d8\u4e3a\u79ef\u6781\u53c2\u4e0e\u8005\u7684\u4eba": 9, "\u5c55\u793a\u4e86\u4ed6\u4eec\u89d2\u8272\u7684\u52a8\u6001\u53d8\u5316": 9, "\u56e2\u961f\u6536\u5230\u6765\u81ea\u534e\u76db\u987f\u7684\u901a\u8baf": 9, "\u8fd9\u5f71\u54cd\u4e86\u4ed6\u4eec\u7684\u51b3\u7b56\u8fc7\u7a0b": 9, "\u51b3\u7b56": 9, "\u5916\u90e8\u5f71\u54cd": 9, "\u56e2\u961f\u76f4\u63a5\u53c2\u4e0e\u675c\u5c14\u585e\u884c\u52a8": 9, "\u6267\u884c\u5176\u6f14\u53d8\u540e\u7684\u76ee\u6807\u548c\u6d3b\u52a8": 9, "\u4efb\u52a1\u6f14\u53d8": 9, "\u79ef\u6781\u53c2\u4e0e": 9, "role": 9, "event": 9, "ntheir": 9, "slice": 9, "through": 9, "illus": 9, "intellig": 9, "liter": 9, "write": 9, "own": 9, "rule": [9, 10], "state": 9, "stoical": 9, "cast": 9, "watch": 9, "over": 9, "flurri": 9, "learn": 9, "commun": 9, "offer": 9, "sam": 9, "rivera": 9, "nearbi": 9, "interfac": 9, "youth": 9, "energi": 9, "bode": 9, "aw": 9, "anxieti": 9, "give": [9, 13], "talk": 9, "stranger": 9, "nalex": 9, "survei": 9, "team": 9, "studi": 9, "concentr": 9, "measur": 9, "trepid": 9, "well": 9, "our": 9, "contact": 9, "he": 9, "acknowledg": 9, "readi": 9, "whatev": 9, "back": 9, "ntogeth": 9, "stood": 9, "unknown": 9, "forg": 9, "human": 9, "heaven": 9, "ensu": 9, "silenc": 9, "palpabl": 9, "collect": 9, "introspect": 9, "about": 9, "grand": 9, "cosmic": 9, "plai": 9, "rewrit": 9, "histori": 9, "encrypt": 9, "dialogu": 9, "continu": 9, "unfold": 9, "intric": 9, "almost": 9, "uncanni": 9, "anticip": 9, "member": 9, "leader": 9, "abil": 9, "govern": 9, "challeng": 9, "capabl": 9, "taken": 9, "involv": 9, "make": 9, "leadership": 9, "explor": 9, "autonomi": 9, "real": 9, "input_text": 9, "default_continue_prompt": 9, "mani": 9, "were": 9, "miss": 9, "same": 9, "default_if_loop_prompt": 9, "appear": 9, "still": 9, "ye": 9, "NO": 9, "default_entity_typ": 9, "geo": 9, "default_tuple_delimit": 9, "default_record_delimit": 9, "default_completion_delimit": 9, "complet": 9, "default_entity_pattern": 9, "default_relation_pattern": 9, "defin": 9, "record": 9, "To": 9, "mark": 9, "end": 9, "num": 9, "llm": 9, "glean": 9, "stop": 9, "add_messag": 9, "light_rag_extract": 9, "extracteventmapp": [9, 13], "event_desc_kei": 9, "__dj__event_description__": 9, "relevant_char_kei": 9, "__dj__relevant_characters__": 9, "\u5bf9\u6587\u672c\u7684\u60c5\u8282\u8fdb\u884c\u5206\u70b9\u603b\u7ed3": 9, "\u5e76\u62bd\u53d6\u4e0e\u60c5\u8282\u76f8\u5173\u7684\u4eba\u7269": 9, "\u5c3d\u91cf\u4e0d\u8981\u9057\u6f0f\u5185\u5bb9": 9, "\u4e0d\u8981\u6dfb\u52a0\u6587\u672c\u4e2d\u6ca1\u6709\u7684\u60c5\u8282": 9, "\u7b26\u5408\u539f\u6587\u4e8b\u5b9e": 9, "\u8054\u7cfb\u4e0a\u4e0b\u6587\u8bf4\u660e\u524d\u56e0\u540e\u679c": 9, "\u4f46\u4ecd\u7136\u9700\u8981\u7b26\u5408\u4e8b\u5b9e": 9, "\u4e0d\u8981\u5305\u542b\u4e3b\u89c2\u770b\u6cd5": 9, "\u6ce8\u610f\u8981\u5c3d\u53ef\u80fd\u4fdd\u7559\u6587\u672c\u7684\u4e13\u6709\u540d\u8bcd": 9, "\u6ce8\u610f\u76f8\u5173\u4eba\u7269\u9700\u8981\u5728\u5bf9\u5e94\u60c5\u8282\u4e2d\u51fa\u73b0": 9, "\u53ea\u62bd\u53d6\u60c5\u8282\u4e2d\u7684\u4e3b\u8981\u4eba\u7269": 9, "\u4e0d\u8981\u9057\u6f0f\u60c5\u8282\u7684\u4e3b\u8981\u4eba\u7269": 9, "\u603b\u7ed3\u683c\u5f0f\u5982\u4e0b": 9, "\u60c5\u82821": 9, "\u60c5\u8282\u63cf\u8ff0": 9, "\u76f8\u5173\u4eba\u7269": 9, "\u4eba\u72691": 9, "\u4eba\u72692": 9, "\u4eba\u72693": 9, "\u60c5\u82822": 9, "\u60c5\u82823": 9, "\u60c5\u8282": 9, "extractkeywordmapp": [9, 13], "keyword_kei": 9, "__dj__keyword__": 9, "topic": 9, "entir": 9, "These": 9, "idea": 9, "present": 9, "content_keyword": 9, "high_level_keyword": 9, "\u51b3\u7b56\u5236\u5b9a": 9, "\u5b87\u5b99\u610f\u4e49": 9, "extractnicknamemapp": [9, 13], "nickname_kei": 9, "__dj__nickname__": 9, "nicknam": 9, "\u7ed9\u5b9a\u4f60\u4e00\u6bb5\u6587\u672c": 9, "\u4f60\u7684\u4efb\u52a1\u662f\u5c06\u4eba\u7269\u4e4b\u95f4\u7684\u79f0\u547c\u65b9\u5f0f": 9, "\u6635\u79f0": 9, "\u63d0\u53d6\u51fa\u6765": 9, "\u9700\u8981\u7ed9\u51fa\u8bf4\u8bdd\u4eba\u5bf9\u88ab\u79f0\u547c\u4eba\u7684\u79f0\u547c": 9, "\u4e0d\u8981\u641e\u53cd\u4e86": 9, "\u76f8\u540c\u7684\u8bf4\u8bdd\u4eba\u548c\u88ab\u79f0\u547c\u4eba\u6700\u591a\u7ed9\u51fa\u4e00\u4e2a\u6700\u5e38\u7528\u7684\u79f0\u547c": 9, "\u8bf7\u4e0d\u8981\u8f93\u51fa\u4e92\u76f8\u6ca1\u6709\u6635\u79f0\u7684\u79f0\u547c\u65b9\u5f0f": 9, "\u8f93\u51fa\u683c\u5f0f\u5982\u4e0b": 9, "\u79f0\u547c\u65b9\u5f0f1": 9, "\u8bf4\u8bdd\u4eba": 9, "\u88ab\u79f0\u547c\u4eba": 9, "\u7684\u6635\u79f0": 9, "\u79f0\u547c\u65b9\u5f0f2": 9, "\u79f0\u547c\u65b9\u5f0f3": 9, "\u79f0\u547c\u65b9\u5f0f": 9, "doubl": 9, "fixunicodemapp": [9, 13], "fix": 9, "unicod": 9, "form": 9, "nfc": 9, "nfkc": 9, "nfd": 9, "nfkd": 9, "generateqafromexamplesmapp": [9, 13], "hf_model": 9, "qwen": 9, "qwen2": 9, "7b": 9, "instruct": 9, "seed_fil": 9, "example_num": 9, "similarity_threshold": 9, "example_templ": 9, "enable_vllm": 9, "your": 9, "\u8bf7\u4f60\u4ed4\u7ec6\u89c2\u5bdf\u591a\u4e2a\u793a\u4f8b\u6570\u636e\u7684\u8f93\u5165\u548c\u8f93\u51fa": 9, "\u6309\u7167\u4f60\u7684\u7406\u89e3": 9, "\u603b\u7ed3\u51fa\u76f8\u5e94\u89c4\u77e9": 9, "\u7136\u540e\u5199\u51fa\u4e00\u4e2a\u65b0\u7684": 9, "\u6ce8\u610f": 9, "\u65b0\u751f\u6210\u7684": 9, "\u9700\u8981\u6ee1\u8db3\u5982\u4e0b\u8981\u6c42": 9, "\u751f\u6210\u7684": 9, "\u4e0d\u80fd\u4e0e\u8f93\u5165\u7684": 9, "\u4e00\u81f4": 9, "\u4f46\u662f\u9700\u8981\u4fdd\u6301\u683c\u5f0f\u76f8\u540c": 9, "\u4e0d\u4e00\u5b9a\u8981\u5c40\u9650\u4e8e\u8f93\u5165": 9, "\u7684\u8bdd\u9898\u6216\u9886\u57df": 9, "\u9700\u8981\u6b63\u786e\u56de\u7b54\u751f\u6210\u7684": 9, "\u63d0\u4f9b\u7684": 9, "\u53ef\u80fd\u662f\u591a\u8f6e\u5bf9\u8bdd": 9, "\u4e5f\u53ef\u4ee5\u662f\u591a\u8f6e": 9, "\u5fc5\u987b\u6210\u5bf9\u51fa\u73b0": 9, "\u800c\u4e14": 9, "\u9700\u8981\u5728": 9, "\u4e4b\u524d": 9, "default_example_templ": 9, "n\u5982\u4e0b\u662f\u4e00\u6761\u793a\u4f8b\u6570\u636e": 9, "hugginfac": 9, "id": 9, "chatml": 9, "put": 9, "qa": 9, "guid": 9, "placehold": 9, "vllm": 9, "infer": 9, "acceler": 9, "qa_exampl": 9, "generateqafromtextmapp": [9, 13], "alibaba": 9, "pai": 9, "qwen1_5": 9, "doc2qa": 9, "llama3": 9, "8b": 9, "baichuan2": 9, "4b": 9, "1b8": 9, "0b5": 9, "suitabl": 9, "\u8499\u53e4\u56fd\u7684\u9996\u90fd\u662f\u4e4c\u5170\u5df4\u6258": 9, "ulaanbaatar": 9, "\u51b0\u5c9b\u7684\u9996\u90fd\u662f\u96f7\u514b\u96c5\u672a\u514b": 9, "reykjavik": 9, "\u8bf7\u95ee\u8499\u53e4\u56fd\u7684\u9996\u90fd\u662f\u54ea\u91cc": 9, "assist": 9, "\u4f60\u597d": 9, "\u6839\u636e\u63d0\u4f9b\u7684\u4fe1\u606f": 9, "\u51b0\u5c9b\u7684\u9996\u90fd\u662f\u54ea\u91cc\u5462": 9, "imageblurmapp": [9, 13], "p": 9, "blur_typ": 9, "gaussian": 9, "radiu": 9, "blur": 9, "blure": 9, "kernel": 9, "imagecaptioningfromgpt4vmapp": [9, 13], "api_kei": 9, "max_token": 9, "user_prompt": 9, "user_prompt_kei": 9, "keep_original_sampl": 9, "visison": 9, "reson": 9, "convers": 9, "custom": 9, "authent": 9, "guidanc": [9, 13], "gpt4": 9, "uers_prompt_kei": 9, "imagecaptioningmapp": [9, 13], "hf_img2seq": 9, "blip2": 9, "opt": 9, "caption_num": 9, "keep_candidate_mod": 9, "random_ani": 9, "prompt_kei": 9, "caption": 9, "anoth": 9, "how": 9, "candid": 9, "similar_one_simhash": 9, "batched_op": 9, "both": [9, 10], "suppos": 9, "b": 9, "denot": 9, "2nb": 9, "nb": 9, "mnb": 9, "similar_on": 9, "imagediffusionmapp": [9, 13], "hf_diffus": 9, "compvi": 9, "stabl": 9, "diffus": 9, "v1": 9, "torch_dtyp": 9, "fp32": 9, "revis": 9, "guidance_scal": 9, "aug_num": 9, "caption_kei": 9, "point": 9, "fp16": 9, "bf16": 9, "branch": 9, "git": 9, "extent": 9, "start": 9, "nois": 9, "higher": 9, "denois": 9, "amount": 9, "num_inference_step": 9, "essenti": 9, "scale": 9, "encourag": 9, "close": 9, "expens": 9, "qualiti": 9, "produc": 9, "otherwis": 9, "imagefaceblurmapp": [9, 13], "imagetaggingmapp": [9, 13], "__dj__image_tags__": 9, "nlpaugenmapp": [9, 13], "sequenti": 9, "delete_random_word": 9, "swap_random_word": 9, "spelling_error_word": 9, "split_random_word": 9, "keyboard_error_char": 9, "ocr_error_char": 9, "delete_random_char": 9, "swap_random_char": 9, "insert_random_char": 9, "simpli": 9, "nlpaug": 9, "librari": 9, "semant": 9, "significantli": 9, "combin": 9, "would": 9, "opened_aug_method": 9, "delet": 9, "love": 9, "swap": 9, "contigu": 9, "simul": 9, "spell": 9, "ll": 9, "keyboard": 9, "ov4": 9, "10ve": 9, "oe": 9, "ovl": 9, "insert": 9, "lkove": 9, "nlpcdazhmapp": [9, 13], "replace_similar_word": 9, "replace_homophone_char": 9, "replace_equivalent_num": 9, "nlpcda": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "\u8fd9\u8fb9\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "homophon": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6fd6\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u5f3a\u589e\u65b9\u6cd5": 9, "equival": 9, "represent": 9, "\u8fd9\u91cc\u4e00\u5171\u6709\u4f0d\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "optimizeqamapp": [9, 13], "\u8bf7\u4f18\u5316\u8f93\u5165\u7684\u95ee\u7b54\u5bf9": 9, "\u4f7f": 9, "\u90fd\u66f4\u52a0\u8be6\u7ec6": 9, "\u5fc5\u987b\u6309\u7167\u4ee5\u4e0b\u6807\u8bb0\u683c\u5f0f": 9, "\u76f4\u63a5\u8f93\u51fa\u4f18\u5316\u540e\u7684\u95ee\u7b54\u5bf9": 9, "n\u4f18\u5316\u540e\u7684\u95ee\u9898": 9, "n\u4f18\u5316\u540e\u7684\u56de\u7b54": 9, "\u4ee5\u4e0b\u662f\u539f\u59cb\u95ee\u7b54\u5bf9": 9, "sure": 9, "optimizequerymapp": [9, 13], "\u4f18\u5316\u95ee\u7b54\u5bf9\u4e2d\u7684": 9, "\u5c06\u5176\u66f4\u52a0\u8be6\u7ec6\u5177\u4f53": 9, "\u4f46\u4ecd\u53ef\u4ee5\u7531\u539f\u7b54\u6848\u56de\u7b54": 9, "\u53ea\u8f93\u51fa\u4f18\u5316\u540e\u7684": 9, "optimizeresponsemapp": [9, 13], "\u8bf7\u4f18\u5316\u95ee\u7b54\u5bf9\u4e2d\u7684\u56de\u7b54": 9, "\u4f46\u4ecd\u53ef\u4ee5\u56de\u7b54\u539f\u95ee\u9898": 9, "\u53ea\u8f93\u51fa\u4f18\u5316\u540e\u7684\u56de\u7b54": 9, "pairpreferencemapp": [9, 13], "rejected_kei": 9, "rejected_respons": 9, "reason_kei": 9, "reason": 9, "prefer": 9, "\u4f60\u7684\u4efb\u52a1\u662f\u6839\u636e\u53c2\u8003\u4fe1\u606f\u4fee\u6539\u95ee\u7b54\u5bf9\u4e2d\u7684\u56de\u7b54": 9, "\u5728\u8bed\u8a00\u98ce\u683c": 9, "\u4e8b\u5b9e\u6027": 9, "\u4eba\u7269\u8eab\u4efd": 9, "\u7acb\u573a\u7b49\u4efb\u4e00\u65b9\u9762\u4e0e\u539f\u56de\u7b54\u76f8\u53cd": 9, "\u5fc5\u987b\u6309\u7167\u4ee5\u4e0b\u6807\u8bb0\u683c\u5f0f\u8f93\u51fa": 9, "\u4e0d\u8981\u8f93\u51fa\u5176\u4ed6\u591a\u4f59\u5185\u5bb9": 9, "n\u751f\u6210\u7684\u65b0\u56de\u7b54": 9, "\u539f\u56e0": 9, "n\u751f\u6210\u8be5\u56de\u7b54\u7684\u539f\u56e0": 9, "n\u4ee5\u4e0b\u662f\u539f\u59cb\u95ee\u7b54\u5bf9": 9, "repons": 9, "reject": 9, "failur": 9, "punctuationnormalizationmapp": [9, 13], "removebibliographymapp": [9, 13], "bibliographi": 9, "removecommentsmapp": [9, 13], "doc_typ": 9, "inlin": 9, "multilin": 9, "removeheadermapp": [9, 13], "drop_no_head": 9, "header": 9, "removelongwordsmapp": [9, 13], "long": 9, "should_keep_long_word": 9, "removenonchinesecharacterlmapp": [9, 13], "keep_alphabet": 9, "keep_numb": 9, "keep_punc": 9, "removerepeatsentencesmapp": [9, 13], "ignore_special_charact": 9, "min_repeat_sentence_length": 9, "repeat": 9, "judg": 9, "letter": 9, "removespecificcharsmapp": [9, 13], "chars_to_remov": 9, "removetabletextmapp": [9, 13], "min_col": 9, "max_col": 9, "20": 9, "removewordswithincorrectsubstringsmapp": [9, 13], "substr": 9, "incorrect": 9, "should_keep_word_with_incorrect_substr": 9, "replacecontentmapp": [9, 13], "design": 9, "sentencesplitmapp": [9, 13], "textchunkmapp": [9, 13], "split_pattern": 9, "overlap_len": 9, "len": 9, "forc": 9, "cut": 9, "offerd": 9, "tiktoken": 9, "dashscop": 9, "72b": 9, "recursively_chunk": 9, "get_text_chunk": 9, "videocaptioningfromaudiomapp": [9, 13], "stream": 9, "videocaptioningfromframesmapp": [9, 13], "videocaptioningfromsummarizermapp": [9, 13], "hf_summar": 9, "consider_video_caption_from_video": 9, "consider_video_caption_from_audio": 9, "consider_video_caption_from_fram": 9, "consider_video_tags_from_audio": 9, "consider_video_tags_from_fram": 9, "vid_cap_from_vid_arg": 9, "vid_cap_from_frm_arg": 9, "vid_tag_from_aud_arg": 9, "vid_tag_from_frm_arg": 9, "keep_tag_num": 9, "too": 9, "bring": 9, "frequent": 9, "videocaptioningfromvideomapp": [9, 13], "hf_video_blip": 9, "kpyu": 9, "ego4d": 9, "videoffmpegwrappedmapp": [9, 13], "videofaceblurmapp": [9, 13], "videoremovewatermarkmapp": [9, 13], "roi_str": 9, "roi_typ": 9, "roi_kei": 9, "min_frame_threshold": 9, "detection_method": 9, "pixel_valu": 9, "region": 9, "x1": 9, "y1": 9, "x2": 9, "y2": 9, "roi": 9, "pixel": 9, "corner": 9, "coordin": 9, "wight": 9, "coodin": 9, "pixel_divers": 9, "useless": 9, "videoresizeaspectratiomapp": [9, 13], "increas": 9, "decreas": 9, "enforc": 9, "adjust": 9, "either": 9, "enlarg": 9, "accept": 9, "videoresizeresolutionmapp": [9, 13], "force_original_aspect_ratio": 9, "disabl": 9, "force_divisible_bi": 9, "leav": 9, "super": 9, "deep": 9, "futur": 9, "necessari": 9, "ensur": 9, "integ": 9, "even": 9, "videosplitbydurationmapp": [9, 13], "split_dur": 9, "min_last_split_dur": 9, "discard": 9, "split_videos_by_dur": 9, "videosplitbykeyframemapp": [9, 13], "get_split_key_fram": 9, "videosplitbyscenemapp": [9, 13], "detector": 9, "contentdetector": 9, "27": 9, "min_scene_len": 9, "15": 9, "show_progress": 9, "scene": 9, "avaliable_detector": 9, "adaptivedetector": 9, "window_width": 9, "min_content_v": 9, "luma_onli": 9, "kernel_s": 9, "video_manag": 9, "min_delta_hsv": 9, "thresholddetector": 9, "fade_bia": 9, "add_final_scen": 9, "block_siz": 9, "scenedetect": 9, "progress": 9, "videotaggingfromaudiomapp": [9, 13], "hf_ast": 9, "mit": 9, "ast": 9, "finetun": 9, "audioset": 9, "4593": 9, "__dj__video_audio_tags__": 9, "spectrogram": 9, "hf": 9, "trust": 9, "videotaggingfromframesmapp": [9, 13], "whitespacenormalizationmapp": [9, 13], "0x20": 9, "wikipedia": 9, "wiki": 9, "whitespace_charact": 9, "frequencyspecifiedfieldselector": [10, 13], "top_ratio": 10, "topk": 10, "sort": 10, "frequenc": 10, "descend": 10, "randomselector": [10, 13], "select_ratio": 10, "select_num": 10, "rangespecifiedfieldselector": [10, 13], "lower_percentil": 10, "upper_percentil": 10, "lower_rank": 10, "upper_rank": 10, "smallest": 10, "bound": 10, "upper": 10, "topkspecifiedfieldselector": [10, 13], "kdd": 13, "24": 13, "modal": 13, "foundat": 13, "practic": 13, "data_juic": 13, "core": 13, "index": 13, "page": 13}, "objects": {"": [[0, 0, 0, "-", "data_juicer"]], "data_juicer": [[1, 0, 0, "-", "analysis"], [2, 0, 0, "-", "config"], [3, 0, 0, "-", "core"], [0, 3, 1, "", "cuda_device_count"], [4, 0, 0, "-", "format"], [0, 3, 1, "", "is_cuda_available"], [5, 0, 0, "-", "ops"], [11, 0, 0, "-", "tools"], [12, 0, 0, "-", "utils"]], "data_juicer.analysis": [[1, 1, 1, "", "ColumnWiseAnalysis"], [1, 1, 1, "", "DiversityAnalysis"], [1, 1, 1, "", "OverallAnalysis"]], "data_juicer.analysis.ColumnWiseAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "draw_box"], [1, 2, 1, "", "draw_hist"]], "data_juicer.analysis.DiversityAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "compute"]], "data_juicer.analysis.OverallAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "refine_single_column"]], "data_juicer.config": [[2, 3, 1, "", "export_config"], [2, 3, 1, "", "get_init_configs"], [2, 3, 1, "", "init_configs"], [2, 3, 1, "", "merge_config"], [2, 3, 1, "", "prepare_side_configs"]], "data_juicer.core": [[3, 1, 1, "", "Adapter"], [3, 1, 1, "", "Analyzer"], [3, 1, 1, "", "Executor"], [3, 1, 1, "", "Exporter"], [3, 1, 1, "", "Monitor"], [3, 1, 1, "", "NestedDataset"], [3, 1, 1, "", "Tracer"]], "data_juicer.core.Adapter": [[3, 4, 1, "", "MAX_BATCH_SIZE"], [3, 2, 1, "", "__init__"], [3, 2, 1, "", "adapt_workloads"], [3, 2, 1, "", "batch_size_strategy"], [3, 2, 1, "", "execute_and_probe"], [3, 2, 1, "", "probe_small_batch"], [3, 2, 1, "", "take_batch"]], "data_juicer.core.Analyzer": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "run"]], "data_juicer.core.Executor": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "run"], [3, 2, 1, "", "sample_data"]], "data_juicer.core.Exporter": [[3, 4, 1, "", "GiB"], [3, 4, 1, "", "KiB"], [3, 4, 1, "", "MiB"], [3, 4, 1, "", "TiB"], [3, 2, 1, "", "__init__"], [3, 2, 1, "", "export"], [3, 2, 1, "", "export_compute_stats"], [3, 2, 1, "", "to_json"], [3, 2, 1, "", "to_jsonl"], [3, 2, 1, "", "to_parquet"]], "data_juicer.core.Monitor": [[3, 4, 1, "", "DYNAMIC_FIELDS"], [3, 2, 1, "", "__init__"], [3, 2, 1, "", "analyze_resource_util_list"], [3, 2, 1, "", "analyze_single_resource_util"], [3, 2, 1, "", "draw_resource_util_graph"], [3, 2, 1, "", "monitor_all_resources"], [3, 2, 1, "", "monitor_current_resources"], [3, 2, 1, "", "monitor_func"]], "data_juicer.core.NestedDataset": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "add_column"], [3, 2, 1, "", "cleanup_cache_files"], [3, 2, 1, "", "filter"], [3, 2, 1, "", "from_dict"], [3, 2, 1, "", "load_from_disk"], [3, 2, 1, "", "map"], [3, 2, 1, "", "process"], [3, 2, 1, "", "remove_columns"], [3, 2, 1, "", "select"], [3, 2, 1, "", "select_columns"]], "data_juicer.core.Tracer": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "trace_batch_mapper"], [3, 2, 1, "", "trace_deduplicator"], [3, 2, 1, "", "trace_filter"], [3, 2, 1, "", "trace_mapper"]], "data_juicer.format": [[4, 1, 1, "", "CsvFormatter"], [4, 1, 1, "", "EmptyFormatter"], [4, 1, 1, "", "JsonFormatter"], [4, 1, 1, "", "LocalFormatter"], [4, 1, 1, "", "MixtureFormatter"], [4, 1, 1, "", "ParquetFormatter"], [4, 1, 1, "", "RayEmptyFormatter"], [4, 1, 1, "", "RemoteFormatter"], [4, 1, 1, "", "TextFormatter"], [4, 1, 1, "", "TsvFormatter"], [4, 3, 1, "", "load_formatter"]], "data_juicer.format.CsvFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.EmptyFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"], [4, 5, 1, "", "null_value"]], "data_juicer.format.JsonFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.LocalFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.MixtureFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"], [4, 2, 1, "", "random_sample"]], "data_juicer.format.ParquetFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.RayEmptyFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"], [4, 5, 1, "", "null_value"]], "data_juicer.format.RemoteFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.TextFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.TsvFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.ops": [[5, 1, 1, "", "Deduplicator"], [5, 1, 1, "", "Filter"], [5, 1, 1, "", "Mapper"], [5, 1, 1, "", "Selector"], [6, 0, 0, "-", "common"], [7, 0, 0, "-", "deduplicator"], [8, 0, 0, "-", "filter"], [5, 3, 1, "", "load_ops"], [9, 0, 0, "-", "mapper"], [10, 0, 0, "-", "selector"]], "data_juicer.ops.Deduplicator": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "compute_hash"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.Filter": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "compute_stats_batched"], [5, 2, 1, "", "compute_stats_single"], [5, 2, 1, "", "process_batched"], [5, 2, 1, "", "process_single"], [5, 2, 1, "", "run"]], "data_juicer.ops.Mapper": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "process_batched"], [5, 2, 1, "", "process_single"], [5, 2, 1, "", "run"]], "data_juicer.ops.Selector": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.common": [[6, 3, 1, "", "get_sentences_from_document"], [6, 3, 1, "", "get_words_from_document"], [6, 3, 1, "", "merge_on_whitespace_tab_newline"], [6, 3, 1, "", "split_on_newline_tab_whitespace"], [6, 3, 1, "", "split_on_whitespace"], [6, 3, 1, "", "split_text_by_punctuation"], [6, 3, 1, "", "strip"], [6, 3, 1, "", "words_augmentation"], [6, 3, 1, "", "words_refinement"]], "data_juicer.ops.deduplicator": [[7, 1, 1, "", "DocumentDeduplicator"], [7, 1, 1, "", "DocumentMinhashDeduplicator"], [7, 1, 1, "", "DocumentSimhashDeduplicator"], [7, 1, 1, "", "ImageDeduplicator"], [7, 1, 1, "", "RayBasicDeduplicator"], [7, 1, 1, "", "RayDocumentDeduplicator"], [7, 1, 1, "", "RayImageDeduplicator"], [7, 1, 1, "", "RayVideoDeduplicator"], [7, 1, 1, "", "VideoDeduplicator"]], "data_juicer.ops.deduplicator.DocumentDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.ImageDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.RayBasicDeduplicator": [[7, 4, 1, "", "EMPTY_HASH_VALUE"], [7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"], [7, 2, 1, "", "compute_stats_single"], [7, 2, 1, "", "process_single"]], "data_juicer.ops.deduplicator.RayDocumentDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.RayImageDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.RayVideoDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.VideoDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.filter": [[8, 1, 1, "", "AlphanumericFilter"], [8, 1, 1, "", "AudioDurationFilter"], [8, 1, 1, "", "AudioNMFSNRFilter"], [8, 1, 1, "", "AudioSizeFilter"], [8, 1, 1, "", "AverageLineLengthFilter"], [8, 1, 1, "", "CharacterRepetitionFilter"], [8, 1, 1, "", "FlaggedWordFilter"], [8, 1, 1, "", "ImageAestheticsFilter"], [8, 1, 1, "", "ImageAspectRatioFilter"], [8, 1, 1, "", "ImageFaceCountFilter"], [8, 1, 1, "", "ImageFaceRatioFilter"], [8, 1, 1, "", "ImageNSFWFilter"], [8, 1, 1, "", "ImagePairSimilarityFilter"], [8, 1, 1, "", "ImageShapeFilter"], [8, 1, 1, "", "ImageSizeFilter"], [8, 1, 1, "", "ImageTextMatchingFilter"], [8, 1, 1, "", "ImageTextSimilarityFilter"], [8, 1, 1, "", "ImageWatermarkFilter"], [8, 1, 1, "", "LanguageIDScoreFilter"], [8, 1, 1, "", "MaximumLineLengthFilter"], [8, 1, 1, "", "PerplexityFilter"], [8, 1, 1, "", "PhraseGroundingRecallFilter"], [8, 1, 1, "", "SpecialCharactersFilter"], [8, 1, 1, "", "SpecifiedFieldFilter"], [8, 1, 1, "", "SpecifiedNumericFieldFilter"], [8, 1, 1, "", "StopWordsFilter"], [8, 1, 1, "", "SuffixFilter"], [8, 1, 1, "", "TextActionFilter"], [8, 1, 1, "", "TextEntityDependencyFilter"], [8, 1, 1, "", "TextLengthFilter"], [8, 1, 1, "", "TokenNumFilter"], [8, 1, 1, "", "VideoAestheticsFilter"], [8, 1, 1, "", "VideoAspectRatioFilter"], [8, 1, 1, "", "VideoDurationFilter"], [8, 1, 1, "", "VideoFramesTextSimilarityFilter"], [8, 1, 1, "", "VideoMotionScoreFilter"], [8, 1, 1, "", "VideoMotionScoreRaftFilter"], [8, 1, 1, "", "VideoNSFWFilter"], [8, 1, 1, "", "VideoOcrAreaRatioFilter"], [8, 1, 1, "", "VideoResolutionFilter"], [8, 1, 1, "", "VideoTaggingFromFramesFilter"], [8, 1, 1, "", "VideoWatermarkFilter"], [8, 1, 1, "", "WordRepetitionFilter"], [8, 1, 1, "", "WordsNumFilter"]], "data_juicer.ops.filter.AlphanumericFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.AudioDurationFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.AudioNMFSNRFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.AudioSizeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.AverageLineLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.CharacterRepetitionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.FlaggedWordFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageAestheticsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageAspectRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageFaceCountFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageFaceRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageNSFWFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImagePairSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageShapeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageSizeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageTextMatchingFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageTextSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageWatermarkFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.LanguageIDScoreFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.MaximumLineLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.PerplexityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.PhraseGroundingRecallFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.SpecialCharactersFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.SpecifiedFieldFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.SpecifiedNumericFieldFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.StopWordsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.SuffixFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.TextActionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.TextEntityDependencyFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.TextLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.TokenNumFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoAestheticsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoAspectRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoDurationFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoFramesTextSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoMotionScoreFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_flow"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"], [8, 2, 1, "", "setup_model"]], "data_juicer.ops.filter.VideoMotionScoreRaftFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_flow"], [8, 2, 1, "", "setup_model"]], "data_juicer.ops.filter.VideoNSFWFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoOcrAreaRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "get_reader"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoResolutionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoTaggingFromFramesFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoWatermarkFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.WordRepetitionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.WordsNumFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper": [[9, 1, 1, "", "AudioFFmpegWrappedMapper"], [9, 1, 1, "", "CalibrateQAMapper"], [9, 1, 1, "", "CalibrateQueryMapper"], [9, 1, 1, "", "CalibrateResponseMapper"], [9, 1, 1, "", "ChineseConvertMapper"], [9, 1, 1, "", "CleanCopyrightMapper"], [9, 1, 1, "", "CleanEmailMapper"], [9, 1, 1, "", "CleanHtmlMapper"], [9, 1, 1, "", "CleanIpMapper"], [9, 1, 1, "", "CleanLinksMapper"], [9, 1, 1, "", "ExpandMacroMapper"], [9, 1, 1, "", "ExtractEntityAttributeMapper"], [9, 1, 1, "", "ExtractEntityRelationMapper"], [9, 1, 1, "", "ExtractEventMapper"], [9, 1, 1, "", "ExtractKeywordMapper"], [9, 1, 1, "", "ExtractNicknameMapper"], [9, 1, 1, "", "FixUnicodeMapper"], [9, 1, 1, "", "GenerateQAFromExamplesMapper"], [9, 1, 1, "", "GenerateQAFromTextMapper"], [9, 1, 1, "", "ImageBlurMapper"], [9, 1, 1, "", "ImageCaptioningFromGPT4VMapper"], [9, 1, 1, "", "ImageCaptioningMapper"], [9, 1, 1, "", "ImageDiffusionMapper"], [9, 1, 1, "", "ImageFaceBlurMapper"], [9, 1, 1, "", "ImageTaggingMapper"], [9, 1, 1, "", "NlpaugEnMapper"], [9, 1, 1, "", "NlpcdaZhMapper"], [9, 1, 1, "", "OptimizeQAMapper"], [9, 1, 1, "", "OptimizeQueryMapper"], [9, 1, 1, "", "OptimizeResponseMapper"], [9, 1, 1, "", "PairPreferenceMapper"], [9, 1, 1, "", "PunctuationNormalizationMapper"], [9, 1, 1, "", "RemoveBibliographyMapper"], [9, 1, 1, "", "RemoveCommentsMapper"], [9, 1, 1, "", "RemoveHeaderMapper"], [9, 1, 1, "", "RemoveLongWordsMapper"], [9, 1, 1, "", "RemoveNonChineseCharacterlMapper"], [9, 1, 1, "", "RemoveRepeatSentencesMapper"], [9, 1, 1, "", "RemoveSpecificCharsMapper"], [9, 1, 1, "", "RemoveTableTextMapper"], [9, 1, 1, "", "RemoveWordsWithIncorrectSubstringsMapper"], [9, 1, 1, "", "ReplaceContentMapper"], [9, 1, 1, "", "SentenceSplitMapper"], [9, 1, 1, "", "TextChunkMapper"], [9, 1, 1, "", "VideoCaptioningFromAudioMapper"], [9, 1, 1, "", "VideoCaptioningFromFramesMapper"], [9, 1, 1, "", "VideoCaptioningFromSummarizerMapper"], [9, 1, 1, "", "VideoCaptioningFromVideoMapper"], [9, 1, 1, "", "VideoFFmpegWrappedMapper"], [9, 1, 1, "", "VideoFaceBlurMapper"], [9, 1, 1, "", "VideoRemoveWatermarkMapper"], [9, 1, 1, "", "VideoResizeAspectRatioMapper"], [9, 1, 1, "", "VideoResizeResolutionMapper"], [9, 1, 1, "", "VideoSplitByDurationMapper"], [9, 1, 1, "", "VideoSplitByKeyFrameMapper"], [9, 1, 1, "", "VideoSplitBySceneMapper"], [9, 1, 1, "", "VideoTaggingFromAudioMapper"], [9, 1, 1, "", "VideoTaggingFromFramesMapper"], [9, 1, 1, "", "WhitespaceNormalizationMapper"]], "data_juicer.ops.mapper.AudioFFmpegWrappedMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.CalibrateQAMapper": [[9, 4, 1, "", "DEFAULT_INPUT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_OUTPUT_PATTERN"], [9, 4, 1, "", "DEFAULT_QA_PAIR_TEMPLATE"], [9, 4, 1, "", "DEFAULT_REFERENCE_TEMPLATE"], [9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "build_input"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.CalibrateQueryMapper": [[9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "parse_output"]], "data_juicer.ops.mapper.CalibrateResponseMapper": [[9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "parse_output"]], "data_juicer.ops.mapper.ChineseConvertMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanCopyrightMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanEmailMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanHtmlMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanIpMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanLinksMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ExpandMacroMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ExtractEntityAttributeMapper": [[9, 4, 1, "", "DEFAULT_ATTR_PATTERN_TEMPLATE"], [9, 4, 1, "", "DEFAULT_DEMON_PATTERN"], [9, 4, 1, "", "DEFAULT_INPUT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT_TEMPLATE"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ExtractEntityRelationMapper": [[9, 4, 1, "", "DEFAULT_COMPLETION_DELIMITER"], [9, 4, 1, "", "DEFAULT_CONTINUE_PROMPT"], [9, 4, 1, "", "DEFAULT_ENTITY_PATTERN"], [9, 4, 1, "", "DEFAULT_ENTITY_TYPES"], [9, 4, 1, "", "DEFAULT_IF_LOOP_PROMPT"], [9, 4, 1, "", "DEFAULT_PROMPT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_RECORD_DELIMITER"], [9, 4, 1, "", "DEFAULT_RELATION_PATTERN"], [9, 4, 1, "", "DEFAULT_TUPLE_DELIMITER"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "add_message"], [9, 2, 1, "", "light_rag_extraction"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.ExtractEventMapper": [[9, 4, 1, "", "DEFAULT_INPUT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_OUTPUT_PATTERN"], [9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ExtractKeywordMapper": [[9, 4, 1, "", "DEFAULT_COMPLETION_DELIMITER"], [9, 4, 1, "", "DEFAULT_OUTPUT_PATTERN"], [9, 4, 1, "", "DEFAULT_PROMPT_TEMPLATE"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.ExtractNicknameMapper": [[9, 4, 1, "", "DEFAULT_INPUT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_OUTPUT_PATTERN"], [9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.FixUnicodeMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.GenerateQAFromExamplesMapper": [[9, 4, 1, "", "DEFAULT_EXAMPLE_TEMPLATE"], [9, 4, 1, "", "DEFAULT_INPUT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_OUTPUT_PATTERN"], [9, 4, 1, "", "DEFAULT_QA_PAIR_TEMPLATE"], [9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "build_input"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.GenerateQAFromTextMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ImageBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ImageCaptioningMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ImageDiffusionMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ImageFaceBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.ImageTaggingMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.NlpaugEnMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.NlpcdaZhMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.OptimizeQAMapper": [[9, 4, 1, "", "DEFAULT_INPUT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_OUTPUT_PATTERN"], [9, 4, 1, "", "DEFAULT_QA_PAIR_TEMPLATE"], [9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "build_input"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.OptimizeQueryMapper": [[9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "parse_output"]], "data_juicer.ops.mapper.OptimizeResponseMapper": [[9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "parse_output"]], "data_juicer.ops.mapper.PairPreferenceMapper": [[9, 4, 1, "", "DEFAULT_INPUT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_OUTPUT_PATTERN"], [9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "build_input"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.PunctuationNormalizationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveBibliographyMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveCommentsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveHeaderMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveLongWordsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"], [9, 2, 1, "", "should_keep_long_word"]], "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveRepeatSentencesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveSpecificCharsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveTableTextMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"], [9, 2, 1, "", "should_keep_word_with_incorrect_substrings"]], "data_juicer.ops.mapper.ReplaceContentMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.SentenceSplitMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.TextChunkMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "get_text_chunks"], [9, 2, 1, "", "process_batched"], [9, 2, 1, "", "recursively_chunk"]], "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoFFmpegWrappedMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoFaceBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoRemoveWatermarkMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoResizeAspectRatioMapper": [[9, 4, 1, "", "STRATEGY"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoResizeResolutionMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoSplitByDurationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"], [9, 2, 1, "", "split_videos_by_duration"]], "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "get_split_key_frame"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoSplitBySceneMapper": [[9, 2, 1, "", "__init__"], [9, 4, 1, "", "avaliable_detectors"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoTaggingFromAudioMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoTaggingFromFramesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.WhitespaceNormalizationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.selector": [[10, 1, 1, "", "FrequencySpecifiedFieldSelector"], [10, 1, 1, "", "RandomSelector"], [10, 1, 1, "", "RangeSpecifiedFieldSelector"], [10, 1, 1, "", "TopkSpecifiedFieldSelector"]], "data_juicer.ops.selector.FrequencySpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.RandomSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.RangeSpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.TopkSpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:method", "3": "py:function", "4": "py:attribute", "5": "py:property"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"], "3": ["py", "function", "Python function"], "4": ["py", "attribute", "Python attribute"], "5": ["py", "property", "Python property"]}, "titleterms": {"data_juic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14], "analysi": 1, "config": 2, "core": 3, "format": 4, "op": [5, 6, 7, 8, 9, 10], "common": 6, "dedupl": 7, "filter": 8, "mapper": 9, "selector": 10, "tool": 11, "util": 12, "welcom": 13, "data": 13, "juicer": 13, "": 13, "document": 13, "tutori": 13, "api": 13, "refer": 13, "indic": 13, "tabl": 13}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx": 58}, "alltitles": {"data_juicer": [[0, "module-data_juicer"], [14, "data-juicer"]], "data_juicer.analysis": [[1, "module-data_juicer.analysis"]], "data_juicer.config": [[2, "module-data_juicer.config"]], "data_juicer.core": [[3, "module-data_juicer.core"]], "}": [[3, "id1"], [3, "id2"]], "data_juicer.format": [[4, "module-data_juicer.format"]], "data_juicer.ops": [[5, "module-data_juicer.ops"]], "data_juicer.ops.common": [[6, "module-data_juicer.ops.common"]], "data_juicer.ops.deduplicator": [[7, "module-data_juicer.ops.deduplicator"]], "data_juicer.ops.filter": [[8, "module-data_juicer.ops.filter"]], "data_juicer.ops.mapper": [[9, "module-data_juicer.ops.mapper"]], "data_juicer.ops.selector": [[10, "module-data_juicer.ops.selector"]], "data_juicer.tools": [[11, "module-data_juicer.tools"]], "data_juicer.utils": [[12, "module-data_juicer.utils"]], "Welcome to data-juicer\u2019s documentation!": [[13, "welcome-to-data-juicer-s-documentation"]], "Tutorial": [[13, "tutorial"]], "API Reference": [[13, null]], "Indices and Tables": [[13, "indices-and-tables"]]}, "indexentries": {"cuda_device_count() (in module data_juicer)": [[0, "data_juicer.cuda_device_count"]], "data_juicer": [[0, "module-data_juicer"]], "is_cuda_available() (in module data_juicer)": [[0, "data_juicer.is_cuda_available"]], "module": [[0, "module-data_juicer"], [1, "module-data_juicer.analysis"], [2, "module-data_juicer.config"], [3, "module-data_juicer.core"], [4, "module-data_juicer.format"], [5, "module-data_juicer.ops"], [6, "module-data_juicer.ops.common"], [7, "module-data_juicer.ops.deduplicator"], [8, "module-data_juicer.ops.filter"], [9, "module-data_juicer.ops.mapper"], [10, "module-data_juicer.ops.selector"], [11, "module-data_juicer.tools"], [12, "module-data_juicer.utils"]], "columnwiseanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.ColumnWiseAnalysis"]], "diversityanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.DiversityAnalysis"]], "overallanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.OverallAnalysis"]], "__init__() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.__init__"]], "__init__() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.__init__"]], "__init__() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.__init__"]], "analyze() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.analyze"]], "analyze() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.analyze"]], "analyze() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.analyze"]], "compute() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.compute"]], "data_juicer.analysis": [[1, "module-data_juicer.analysis"]], "draw_box() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.draw_box"]], "draw_hist() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.draw_hist"]], "refine_single_column() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.refine_single_column"]], "data_juicer.config": [[2, "module-data_juicer.config"]], "export_config() (in module data_juicer.config)": [[2, "data_juicer.config.export_config"]], "get_init_configs() (in module data_juicer.config)": [[2, "data_juicer.config.get_init_configs"]], "init_configs() (in module data_juicer.config)": [[2, "data_juicer.config.init_configs"]], "merge_config() (in module data_juicer.config)": [[2, "data_juicer.config.merge_config"]], "prepare_side_configs() (in module data_juicer.config)": [[2, "data_juicer.config.prepare_side_configs"]], "adapter (class in data_juicer.core)": [[3, "data_juicer.core.Adapter"]], "analyzer (class in data_juicer.core)": [[3, "data_juicer.core.Analyzer"]], "dynamic_fields (data_juicer.core.monitor attribute)": [[3, "data_juicer.core.Monitor.DYNAMIC_FIELDS"]], "executor (class in data_juicer.core)": [[3, "data_juicer.core.Executor"]], "exporter (class in data_juicer.core)": [[3, "data_juicer.core.Exporter"]], "gib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.GiB"]], "kib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.KiB"]], "max_batch_size (data_juicer.core.adapter attribute)": [[3, "data_juicer.core.Adapter.MAX_BATCH_SIZE"]], "mib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.MiB"]], "monitor (class in data_juicer.core)": [[3, "data_juicer.core.Monitor"]], "nesteddataset (class in data_juicer.core)": [[3, "data_juicer.core.NestedDataset"]], "tib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.TiB"]], "tracer (class in data_juicer.core)": [[3, "data_juicer.core.Tracer"]], "__init__() (data_juicer.core.adapter method)": [[3, "data_juicer.core.Adapter.__init__"]], "__init__() (data_juicer.core.analyzer method)": [[3, "data_juicer.core.Analyzer.__init__"]], "__init__() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.__init__"]], "__init__() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.__init__"]], "__init__() (data_juicer.core.monitor method)": [[3, "data_juicer.core.Monitor.__init__"]], "__init__() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.__init__"]], "__init__() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.__init__"]], "adapt_workloads() (data_juicer.core.adapter method)": [[3, "data_juicer.core.Adapter.adapt_workloads"]], "add_column() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.add_column"]], "analyze_resource_util_list() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.analyze_resource_util_list"]], "analyze_single_resource_util() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.analyze_single_resource_util"]], "batch_size_strategy() (data_juicer.core.adapter method)": [[3, "data_juicer.core.Adapter.batch_size_strategy"]], "cleanup_cache_files() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.cleanup_cache_files"]], "data_juicer.core": [[3, "module-data_juicer.core"]], "draw_resource_util_graph() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.draw_resource_util_graph"]], "execute_and_probe() (data_juicer.core.adapter static method)": [[3, "data_juicer.core.Adapter.execute_and_probe"]], "export() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.export"]], "export_compute_stats() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.export_compute_stats"]], "filter() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.filter"]], "from_dict() (data_juicer.core.nesteddataset class method)": [[3, "data_juicer.core.NestedDataset.from_dict"]], "load_from_disk() (data_juicer.core.nesteddataset static method)": [[3, "data_juicer.core.NestedDataset.load_from_disk"]], "map() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.map"]], "monitor_all_resources() (data_juicer.core.monitor method)": [[3, "data_juicer.core.Monitor.monitor_all_resources"]], "monitor_current_resources() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.monitor_current_resources"]], "monitor_func() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.monitor_func"]], "probe_small_batch() (data_juicer.core.adapter method)": [[3, "data_juicer.core.Adapter.probe_small_batch"]], "process() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.process"]], "remove_columns() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.remove_columns"]], "run() (data_juicer.core.analyzer method)": [[3, "data_juicer.core.Analyzer.run"]], "run() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.run"]], "sample_data() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.sample_data"]], "select() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.select"]], "select_columns() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.select_columns"]], "take_batch() (data_juicer.core.adapter static method)": [[3, "data_juicer.core.Adapter.take_batch"]], "to_json() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_json"]], "to_jsonl() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_jsonl"]], "to_parquet() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_parquet"]], "trace_batch_mapper() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_batch_mapper"]], "trace_deduplicator() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_deduplicator"]], "trace_filter() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_filter"]], "trace_mapper() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_mapper"]], "csvformatter (class in data_juicer.format)": [[4, "data_juicer.format.CsvFormatter"]], "emptyformatter (class in data_juicer.format)": [[4, "data_juicer.format.EmptyFormatter"]], "jsonformatter (class in data_juicer.format)": [[4, "data_juicer.format.JsonFormatter"]], "localformatter (class in data_juicer.format)": [[4, "data_juicer.format.LocalFormatter"]], "mixtureformatter (class in data_juicer.format)": [[4, "data_juicer.format.MixtureFormatter"]], "parquetformatter (class in data_juicer.format)": [[4, "data_juicer.format.ParquetFormatter"]], "rayemptyformatter (class in data_juicer.format)": [[4, "data_juicer.format.RayEmptyFormatter"]], "remoteformatter (class in data_juicer.format)": [[4, "data_juicer.format.RemoteFormatter"]], "suffixes (data_juicer.format.csvformatter attribute)": [[4, "data_juicer.format.CsvFormatter.SUFFIXES"]], "suffixes (data_juicer.format.emptyformatter attribute)": [[4, "data_juicer.format.EmptyFormatter.SUFFIXES"]], "suffixes (data_juicer.format.jsonformatter attribute)": [[4, "data_juicer.format.JsonFormatter.SUFFIXES"]], "suffixes (data_juicer.format.parquetformatter attribute)": [[4, "data_juicer.format.ParquetFormatter.SUFFIXES"]], "suffixes (data_juicer.format.rayemptyformatter attribute)": [[4, "data_juicer.format.RayEmptyFormatter.SUFFIXES"]], "suffixes (data_juicer.format.textformatter attribute)": [[4, "data_juicer.format.TextFormatter.SUFFIXES"]], "suffixes (data_juicer.format.tsvformatter attribute)": [[4, "data_juicer.format.TsvFormatter.SUFFIXES"]], "textformatter (class in data_juicer.format)": [[4, "data_juicer.format.TextFormatter"]], "tsvformatter (class in data_juicer.format)": [[4, "data_juicer.format.TsvFormatter"]], "__init__() (data_juicer.format.csvformatter method)": [[4, "data_juicer.format.CsvFormatter.__init__"]], "__init__() (data_juicer.format.emptyformatter method)": [[4, "data_juicer.format.EmptyFormatter.__init__"]], "__init__() (data_juicer.format.jsonformatter method)": [[4, "data_juicer.format.JsonFormatter.__init__"]], "__init__() (data_juicer.format.localformatter method)": [[4, "data_juicer.format.LocalFormatter.__init__"]], "__init__() (data_juicer.format.mixtureformatter method)": [[4, "data_juicer.format.MixtureFormatter.__init__"]], "__init__() (data_juicer.format.parquetformatter method)": [[4, "data_juicer.format.ParquetFormatter.__init__"]], "__init__() (data_juicer.format.rayemptyformatter method)": [[4, "data_juicer.format.RayEmptyFormatter.__init__"]], "__init__() (data_juicer.format.remoteformatter method)": [[4, "data_juicer.format.RemoteFormatter.__init__"]], "__init__() (data_juicer.format.textformatter method)": [[4, "data_juicer.format.TextFormatter.__init__"]], "__init__() (data_juicer.format.tsvformatter method)": [[4, "data_juicer.format.TsvFormatter.__init__"]], "data_juicer.format": [[4, "module-data_juicer.format"]], "load_dataset() (data_juicer.format.emptyformatter method)": [[4, "data_juicer.format.EmptyFormatter.load_dataset"]], "load_dataset() (data_juicer.format.localformatter method)": [[4, "data_juicer.format.LocalFormatter.load_dataset"]], "load_dataset() (data_juicer.format.mixtureformatter method)": [[4, "data_juicer.format.MixtureFormatter.load_dataset"]], "load_dataset() (data_juicer.format.rayemptyformatter method)": [[4, "data_juicer.format.RayEmptyFormatter.load_dataset"]], "load_dataset() (data_juicer.format.remoteformatter method)": [[4, "data_juicer.format.RemoteFormatter.load_dataset"]], "load_dataset() (data_juicer.format.textformatter method)": [[4, "data_juicer.format.TextFormatter.load_dataset"]], "load_formatter() (in module data_juicer.format)": [[4, "data_juicer.format.load_formatter"]], "null_value (data_juicer.format.emptyformatter property)": [[4, "data_juicer.format.EmptyFormatter.null_value"]], "null_value (data_juicer.format.rayemptyformatter property)": [[4, "data_juicer.format.RayEmptyFormatter.null_value"]], "random_sample() (data_juicer.format.mixtureformatter class method)": [[4, "data_juicer.format.MixtureFormatter.random_sample"]], "deduplicator (class in data_juicer.ops)": [[5, "data_juicer.ops.Deduplicator"]], "filter (class in data_juicer.ops)": [[5, "data_juicer.ops.Filter"]], "mapper (class in data_juicer.ops)": [[5, "data_juicer.ops.Mapper"]], "selector (class in data_juicer.ops)": [[5, "data_juicer.ops.Selector"]], "__init__() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.__init__"]], "__init__() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.__init__"]], "__init__() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.__init__"]], "__init__() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.__init__"]], "compute_hash() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.compute_hash"]], "compute_stats_batched() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.compute_stats_batched"]], "compute_stats_single() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.compute_stats_single"]], "data_juicer.ops": [[5, "module-data_juicer.ops"]], "load_ops() (in module data_juicer.ops)": [[5, "data_juicer.ops.load_ops"]], "process() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.process"]], "process() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.process"]], "process_batched() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.process_batched"]], "process_batched() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.process_batched"]], "process_single() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.process_single"]], "process_single() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.process_single"]], "run() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.run"]], "run() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.run"]], "run() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.run"]], "run() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.run"]], "data_juicer.ops.common": [[6, "module-data_juicer.ops.common"]], "get_sentences_from_document() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.get_sentences_from_document"]], "get_words_from_document() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.get_words_from_document"]], "merge_on_whitespace_tab_newline() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.merge_on_whitespace_tab_newline"]], "split_on_newline_tab_whitespace() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.split_on_newline_tab_whitespace"]], "split_on_whitespace() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.split_on_whitespace"]], "split_text_by_punctuation() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.split_text_by_punctuation"]], "strip() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.strip"]], "words_augmentation() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.words_augmentation"]], "words_refinement() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.words_refinement"]], "documentdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator"]], "documentminhashdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator"]], "documentsimhashdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator"]], "empty_hash_value (data_juicer.ops.deduplicator.raybasicdeduplicator attribute)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.EMPTY_HASH_VALUE"]], "imagededuplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator"]], "raybasicdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator"]], "raydocumentdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator"]], "rayimagededuplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator"]], "rayvideodeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator"]], "videodeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator"]], "__init__() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.raydocumentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.rayimagededuplicator method)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.rayvideodeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.__init__"]], "calculate_hash() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.raydocumentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.rayimagededuplicator method)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.rayvideodeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator.calculate_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.compute_hash"]], "compute_stats_single() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.compute_stats_single"]], "data_juicer.ops.deduplicator": [[7, "module-data_juicer.ops.deduplicator"]], "process() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.process"]], "process_single() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.process_single"]], "alphanumericfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AlphanumericFilter"]], "audiodurationfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioDurationFilter"]], "audionmfsnrfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter"]], "audiosizefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioSizeFilter"]], "averagelinelengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter"]], "characterrepetitionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter"]], "flaggedwordfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.FlaggedWordFilter"]], "imageaestheticsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter"]], "imageaspectratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter"]], "imagefacecountfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageFaceCountFilter"]], "imagefaceratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter"]], "imagensfwfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageNSFWFilter"]], "imagepairsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImagePairSimilarityFilter"]], "imageshapefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageShapeFilter"]], "imagesizefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageSizeFilter"]], "imagetextmatchingfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter"]], "imagetextsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter"]], "imagewatermarkfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter"]], "languageidscorefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter"]], "maximumlinelengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter"]], "perplexityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.PerplexityFilter"]], "phrasegroundingrecallfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter"]], "specialcharactersfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter"]], "specifiedfieldfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter"]], "specifiednumericfieldfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter"]], "stopwordsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.StopWordsFilter"]], "suffixfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SuffixFilter"]], "textactionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextActionFilter"]], "textentitydependencyfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter"]], "textlengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextLengthFilter"]], "tokennumfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TokenNumFilter"]], "videoaestheticsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter"]], "videoaspectratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter"]], "videodurationfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoDurationFilter"]], "videoframestextsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter"]], "videomotionscorefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter"]], "videomotionscoreraftfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoMotionScoreRaftFilter"]], "videonsfwfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoNSFWFilter"]], "videoocrarearatiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter"]], "videoresolutionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoResolutionFilter"]], "videotaggingfromframesfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter"]], "videowatermarkfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter"]], "wordrepetitionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.WordRepetitionFilter"]], "wordsnumfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.WordsNumFilter"]], "__init__() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.__init__"]], "__init__() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.__init__"]], "__init__() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.__init__"]], "__init__() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.__init__"]], "__init__() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.__init__"]], "__init__() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagefacecountfilter method)": [[8, "data_juicer.ops.filter.ImageFaceCountFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagepairsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImagePairSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.__init__"]], "__init__() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.__init__"]], "__init__() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.__init__"]], "__init__() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.__init__"]], "__init__() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.__init__"]], "__init__() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.__init__"]], "__init__() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.__init__"]], "__init__() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.__init__"]], "__init__() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.__init__"]], "__init__() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.__init__"]], "__init__() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.__init__"]], "__init__() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.__init__"]], "__init__() (data_juicer.ops.filter.videomotionscoreraftfilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreRaftFilter.__init__"]], "__init__() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.__init__"]], "__init__() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.__init__"]], "__init__() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.__init__"]], "__init__() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.__init__"]], "__init__() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.__init__"]], "compute_flow() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.compute_flow"]], "compute_flow() (data_juicer.ops.filter.videomotionscoreraftfilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreRaftFilter.compute_flow"]], "compute_stats_batched() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.compute_stats_batched"]], "compute_stats_single() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagefacecountfilter method)": [[8, "data_juicer.ops.filter.ImageFaceCountFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagepairsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImagePairSimilarityFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.compute_stats_single"]], "data_juicer.ops.filter": [[8, "module-data_juicer.ops.filter"]], "get_reader() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.get_reader"]], "process_batched() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.process_batched"]], "process_single() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.process_single"]], "process_single() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.process_single"]], "process_single() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.process_single"]], "process_single() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.process_single"]], "process_single() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.process_single"]], "process_single() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagefacecountfilter method)": [[8, "data_juicer.ops.filter.ImageFaceCountFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagepairsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImagePairSimilarityFilter.process_single"]], "process_single() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.process_single"]], "process_single() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.process_single"]], "process_single() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.process_single"]], "process_single() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.process_single"]], "process_single() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.process_single"]], "process_single() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.process_single"]], "process_single() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.process_single"]], "process_single() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.process_single"]], "process_single() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.process_single"]], "process_single() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.process_single"]], "process_single() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.process_single"]], "process_single() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.process_single"]], "process_single() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.process_single"]], "process_single() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.process_single"]], "process_single() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.process_single"]], "setup_model() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.setup_model"]], "setup_model() (data_juicer.ops.filter.videomotionscoreraftfilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreRaftFilter.setup_model"]], "audioffmpegwrappedmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper"]], "calibrateqamapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper"]], "calibratequerymapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CalibrateQueryMapper"]], "calibrateresponsemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CalibrateResponseMapper"]], "chineseconvertmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper"]], "cleancopyrightmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper"]], "cleanemailmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanEmailMapper"]], "cleanhtmlmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper"]], "cleanipmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanIpMapper"]], "cleanlinksmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanLinksMapper"]], "default_attr_pattern_template (data_juicer.ops.mapper.extractentityattributemapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityAttributeMapper.DEFAULT_ATTR_PATTERN_TEMPLATE"]], "default_completion_delimiter (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_COMPLETION_DELIMITER"]], "default_completion_delimiter (data_juicer.ops.mapper.extractkeywordmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractKeywordMapper.DEFAULT_COMPLETION_DELIMITER"]], "default_continue_prompt (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_CONTINUE_PROMPT"]], "default_demon_pattern (data_juicer.ops.mapper.extractentityattributemapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityAttributeMapper.DEFAULT_DEMON_PATTERN"]], "default_entity_pattern (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_ENTITY_PATTERN"]], "default_entity_types (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_ENTITY_TYPES"]], "default_example_template (data_juicer.ops.mapper.generateqafromexamplesmapper attribute)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.DEFAULT_EXAMPLE_TEMPLATE"]], "default_if_loop_prompt (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_IF_LOOP_PROMPT"]], "default_input_template (data_juicer.ops.mapper.calibrateqamapper attribute)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.DEFAULT_INPUT_TEMPLATE"]], "default_input_template (data_juicer.ops.mapper.extractentityattributemapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityAttributeMapper.DEFAULT_INPUT_TEMPLATE"]], "default_input_template (data_juicer.ops.mapper.extracteventmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEventMapper.DEFAULT_INPUT_TEMPLATE"]], "default_input_template (data_juicer.ops.mapper.extractnicknamemapper attribute)": [[9, "data_juicer.ops.mapper.ExtractNicknameMapper.DEFAULT_INPUT_TEMPLATE"]], "default_input_template (data_juicer.ops.mapper.generateqafromexamplesmapper attribute)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.DEFAULT_INPUT_TEMPLATE"]], "default_input_template (data_juicer.ops.mapper.optimizeqamapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.DEFAULT_INPUT_TEMPLATE"]], "default_input_template (data_juicer.ops.mapper.pairpreferencemapper attribute)": [[9, "data_juicer.ops.mapper.PairPreferenceMapper.DEFAULT_INPUT_TEMPLATE"]], "default_output_pattern (data_juicer.ops.mapper.calibrateqamapper attribute)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.DEFAULT_OUTPUT_PATTERN"]], "default_output_pattern (data_juicer.ops.mapper.extracteventmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEventMapper.DEFAULT_OUTPUT_PATTERN"]], "default_output_pattern (data_juicer.ops.mapper.extractkeywordmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractKeywordMapper.DEFAULT_OUTPUT_PATTERN"]], "default_output_pattern (data_juicer.ops.mapper.extractnicknamemapper attribute)": [[9, "data_juicer.ops.mapper.ExtractNicknameMapper.DEFAULT_OUTPUT_PATTERN"]], "default_output_pattern (data_juicer.ops.mapper.generateqafromexamplesmapper attribute)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.DEFAULT_OUTPUT_PATTERN"]], "default_output_pattern (data_juicer.ops.mapper.optimizeqamapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.DEFAULT_OUTPUT_PATTERN"]], "default_output_pattern (data_juicer.ops.mapper.pairpreferencemapper attribute)": [[9, "data_juicer.ops.mapper.PairPreferenceMapper.DEFAULT_OUTPUT_PATTERN"]], "default_prompt_template (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_PROMPT_TEMPLATE"]], "default_prompt_template (data_juicer.ops.mapper.extractkeywordmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractKeywordMapper.DEFAULT_PROMPT_TEMPLATE"]], "default_qa_pair_template (data_juicer.ops.mapper.calibrateqamapper attribute)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.DEFAULT_QA_PAIR_TEMPLATE"]], "default_qa_pair_template (data_juicer.ops.mapper.generateqafromexamplesmapper attribute)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.DEFAULT_QA_PAIR_TEMPLATE"]], "default_qa_pair_template (data_juicer.ops.mapper.optimizeqamapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.DEFAULT_QA_PAIR_TEMPLATE"]], "default_record_delimiter (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_RECORD_DELIMITER"]], "default_reference_template (data_juicer.ops.mapper.calibrateqamapper attribute)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.DEFAULT_REFERENCE_TEMPLATE"]], "default_relation_pattern (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_RELATION_PATTERN"]], "default_system_prompt (data_juicer.ops.mapper.calibrateqamapper attribute)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt (data_juicer.ops.mapper.calibratequerymapper attribute)": [[9, "data_juicer.ops.mapper.CalibrateQueryMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt (data_juicer.ops.mapper.calibrateresponsemapper attribute)": [[9, "data_juicer.ops.mapper.CalibrateResponseMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt (data_juicer.ops.mapper.extracteventmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEventMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt (data_juicer.ops.mapper.extractnicknamemapper attribute)": [[9, "data_juicer.ops.mapper.ExtractNicknameMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt (data_juicer.ops.mapper.generateqafromexamplesmapper attribute)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt (data_juicer.ops.mapper.optimizeqamapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt (data_juicer.ops.mapper.optimizequerymapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeQueryMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt (data_juicer.ops.mapper.optimizeresponsemapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeResponseMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt (data_juicer.ops.mapper.pairpreferencemapper attribute)": [[9, "data_juicer.ops.mapper.PairPreferenceMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt_template (data_juicer.ops.mapper.extractentityattributemapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityAttributeMapper.DEFAULT_SYSTEM_PROMPT_TEMPLATE"]], "default_tuple_delimiter (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_TUPLE_DELIMITER"]], "expandmacromapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper"]], "extractentityattributemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExtractEntityAttributeMapper"]], "extractentityrelationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper"]], "extracteventmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExtractEventMapper"]], "extractkeywordmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExtractKeywordMapper"]], "extractnicknamemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExtractNicknameMapper"]], "fixunicodemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper"]], "generateqafromexamplesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper"]], "generateqafromtextmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.GenerateQAFromTextMapper"]], "imageblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageBlurMapper"]], "imagecaptioningfromgpt4vmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper"]], "imagecaptioningmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper"]], "imagediffusionmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper"]], "imagefaceblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper"]], "imagetaggingmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageTaggingMapper"]], "nlpaugenmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper"]], "nlpcdazhmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper"]], "optimizeqamapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper"]], "optimizequerymapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.OptimizeQueryMapper"]], "optimizeresponsemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.OptimizeResponseMapper"]], "pairpreferencemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.PairPreferenceMapper"]], "punctuationnormalizationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper"]], "removebibliographymapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper"]], "removecommentsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper"]], "removeheadermapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper"]], "removelongwordsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper"]], "removenonchinesecharacterlmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper"]], "removerepeatsentencesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper"]], "removespecificcharsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper"]], "removetabletextmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper"]], "removewordswithincorrectsubstringsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper"]], "replacecontentmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper"]], "strategy (data_juicer.ops.mapper.videoresizeaspectratiomapper attribute)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.STRATEGY"]], "sentencesplitmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper"]], "textchunkmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.TextChunkMapper"]], "videocaptioningfromaudiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper"]], "videocaptioningfromframesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper"]], "videocaptioningfromsummarizermapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper"]], "videocaptioningfromvideomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper"]], "videoffmpegwrappedmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper"]], "videofaceblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper"]], "videoremovewatermarkmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper"]], "videoresizeaspectratiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper"]], "videoresizeresolutionmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper"]], "videosplitbydurationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper"]], "videosplitbykeyframemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper"]], "videosplitbyscenemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper"]], "videotaggingfromaudiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper"]], "videotaggingfromframesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper"]], "whitespacenormalizationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper"]], "__init__() (data_juicer.ops.mapper.audioffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper.__init__"]], "__init__() (data_juicer.ops.mapper.calibrateqamapper method)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.__init__"]], "__init__() (data_juicer.ops.mapper.chineseconvertmapper method)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleancopyrightmapper method)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanemailmapper method)": [[9, "data_juicer.ops.mapper.CleanEmailMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanhtmlmapper method)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanipmapper method)": [[9, "data_juicer.ops.mapper.CleanIpMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanlinksmapper method)": [[9, "data_juicer.ops.mapper.CleanLinksMapper.__init__"]], "__init__() (data_juicer.ops.mapper.expandmacromapper method)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper.__init__"]], "__init__() (data_juicer.ops.mapper.extractentityattributemapper method)": [[9, "data_juicer.ops.mapper.ExtractEntityAttributeMapper.__init__"]], "__init__() (data_juicer.ops.mapper.extractentityrelationmapper method)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.__init__"]], "__init__() (data_juicer.ops.mapper.extracteventmapper method)": [[9, "data_juicer.ops.mapper.ExtractEventMapper.__init__"]], "__init__() (data_juicer.ops.mapper.extractkeywordmapper method)": [[9, "data_juicer.ops.mapper.ExtractKeywordMapper.__init__"]], "__init__() (data_juicer.ops.mapper.extractnicknamemapper method)": [[9, "data_juicer.ops.mapper.ExtractNicknameMapper.__init__"]], "__init__() (data_juicer.ops.mapper.fixunicodemapper method)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper.__init__"]], "__init__() (data_juicer.ops.mapper.generateqafromexamplesmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.generateqafromtextmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromTextMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imageblurmapper method)": [[9, "data_juicer.ops.mapper.ImageBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagecaptioningfromgpt4vmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagecaptioningmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagediffusionmapper method)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagefaceblurmapper method)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagetaggingmapper method)": [[9, "data_juicer.ops.mapper.ImageTaggingMapper.__init__"]], "__init__() (data_juicer.ops.mapper.nlpaugenmapper method)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper.__init__"]], "__init__() (data_juicer.ops.mapper.nlpcdazhmapper method)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper.__init__"]], "__init__() (data_juicer.ops.mapper.optimizeqamapper method)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.__init__"]], "__init__() (data_juicer.ops.mapper.pairpreferencemapper method)": [[9, "data_juicer.ops.mapper.PairPreferenceMapper.__init__"]], "__init__() (data_juicer.ops.mapper.punctuationnormalizationmapper method)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removebibliographymapper method)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removecommentsmapper method)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removeheadermapper method)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removenonchinesecharacterlmapper method)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removerepeatsentencesmapper method)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removespecificcharsmapper method)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removetabletextmapper method)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.replacecontentmapper method)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper.__init__"]], "__init__() (data_juicer.ops.mapper.sentencesplitmapper method)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper.__init__"]], "__init__() (data_juicer.ops.mapper.textchunkmapper method)": [[9, "data_juicer.ops.mapper.TextChunkMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromsummarizermapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromvideomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videofaceblurmapper method)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoremovewatermarkmapper method)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoresizeaspectratiomapper method)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoresizeresolutionmapper method)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbyscenemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videotaggingfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videotaggingfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.whitespacenormalizationmapper method)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper.__init__"]], "add_message() (data_juicer.ops.mapper.extractentityrelationmapper method)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.add_message"]], "avaliable_detectors (data_juicer.ops.mapper.videosplitbyscenemapper attribute)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.avaliable_detectors"]], "build_input() (data_juicer.ops.mapper.calibrateqamapper method)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.build_input"]], "build_input() (data_juicer.ops.mapper.generateqafromexamplesmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.build_input"]], "build_input() (data_juicer.ops.mapper.optimizeqamapper method)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.build_input"]], "build_input() (data_juicer.ops.mapper.pairpreferencemapper method)": [[9, "data_juicer.ops.mapper.PairPreferenceMapper.build_input"]], "data_juicer.ops.mapper": [[9, "module-data_juicer.ops.mapper"]], "get_split_key_frame() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.get_split_key_frame"]], "get_text_chunks() (data_juicer.ops.mapper.textchunkmapper method)": [[9, "data_juicer.ops.mapper.TextChunkMapper.get_text_chunks"]], "light_rag_extraction() (data_juicer.ops.mapper.extractentityrelationmapper method)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.light_rag_extraction"]], "parse_output() (data_juicer.ops.mapper.calibrateqamapper method)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.calibratequerymapper method)": [[9, "data_juicer.ops.mapper.CalibrateQueryMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.calibrateresponsemapper method)": [[9, "data_juicer.ops.mapper.CalibrateResponseMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.extractentityattributemapper method)": [[9, "data_juicer.ops.mapper.ExtractEntityAttributeMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.extractentityrelationmapper method)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.extracteventmapper method)": [[9, "data_juicer.ops.mapper.ExtractEventMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.extractkeywordmapper method)": [[9, "data_juicer.ops.mapper.ExtractKeywordMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.extractnicknamemapper method)": [[9, "data_juicer.ops.mapper.ExtractNicknameMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.generateqafromexamplesmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.generateqafromtextmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromTextMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.optimizeqamapper method)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.optimizequerymapper method)": [[9, "data_juicer.ops.mapper.OptimizeQueryMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.optimizeresponsemapper method)": [[9, "data_juicer.ops.mapper.OptimizeResponseMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.pairpreferencemapper method)": [[9, "data_juicer.ops.mapper.PairPreferenceMapper.parse_output"]], "process_batched() (data_juicer.ops.mapper.chineseconvertmapper method)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleancopyrightmapper method)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleanemailmapper method)": [[9, "data_juicer.ops.mapper.CleanEmailMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleanhtmlmapper method)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleanipmapper method)": [[9, "data_juicer.ops.mapper.CleanIpMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleanlinksmapper method)": [[9, "data_juicer.ops.mapper.CleanLinksMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.expandmacromapper method)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.extractentityattributemapper method)": [[9, "data_juicer.ops.mapper.ExtractEntityAttributeMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.extracteventmapper method)": [[9, "data_juicer.ops.mapper.ExtractEventMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.fixunicodemapper method)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.generateqafromtextmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromTextMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.imagecaptioningfromgpt4vmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.imagecaptioningmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.imagediffusionmapper method)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.nlpaugenmapper method)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.nlpcdazhmapper method)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.punctuationnormalizationmapper method)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removebibliographymapper method)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removecommentsmapper method)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removeheadermapper method)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removenonchinesecharacterlmapper method)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removerepeatsentencesmapper method)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removespecificcharsmapper method)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removetabletextmapper method)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.replacecontentmapper method)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.sentencesplitmapper method)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.textchunkmapper method)": [[9, "data_juicer.ops.mapper.TextChunkMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videocaptioningfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videocaptioningfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videocaptioningfromsummarizermapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videocaptioningfromvideomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.whitespacenormalizationmapper method)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper.process_batched"]], "process_single() (data_juicer.ops.mapper.audioffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper.process_single"]], "process_single() (data_juicer.ops.mapper.calibrateqamapper method)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.process_single"]], "process_single() (data_juicer.ops.mapper.extractentityrelationmapper method)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.process_single"]], "process_single() (data_juicer.ops.mapper.extractkeywordmapper method)": [[9, "data_juicer.ops.mapper.ExtractKeywordMapper.process_single"]], "process_single() (data_juicer.ops.mapper.extractnicknamemapper method)": [[9, "data_juicer.ops.mapper.ExtractNicknameMapper.process_single"]], "process_single() (data_juicer.ops.mapper.generateqafromexamplesmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.process_single"]], "process_single() (data_juicer.ops.mapper.imageblurmapper method)": [[9, "data_juicer.ops.mapper.ImageBlurMapper.process_single"]], "process_single() (data_juicer.ops.mapper.imagefaceblurmapper method)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper.process_single"]], "process_single() (data_juicer.ops.mapper.imagetaggingmapper method)": [[9, "data_juicer.ops.mapper.ImageTaggingMapper.process_single"]], "process_single() (data_juicer.ops.mapper.optimizeqamapper method)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.process_single"]], "process_single() (data_juicer.ops.mapper.pairpreferencemapper method)": [[9, "data_juicer.ops.mapper.PairPreferenceMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videoffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videofaceblurmapper method)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videoremovewatermarkmapper method)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videoresizeaspectratiomapper method)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videoresizeresolutionmapper method)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videosplitbyscenemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videotaggingfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videotaggingfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper.process_single"]], "recursively_chunk() (data_juicer.ops.mapper.textchunkmapper method)": [[9, "data_juicer.ops.mapper.TextChunkMapper.recursively_chunk"]], "should_keep_long_word() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.should_keep_long_word"]], "should_keep_word_with_incorrect_substrings() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.should_keep_word_with_incorrect_substrings"]], "split_videos_by_duration() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.split_videos_by_duration"]], "frequencyspecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector"]], "randomselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.RandomSelector"]], "rangespecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector"]], "topkspecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector"]], "__init__() (data_juicer.ops.selector.frequencyspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector.__init__"]], "__init__() (data_juicer.ops.selector.randomselector method)": [[10, "data_juicer.ops.selector.RandomSelector.__init__"]], "__init__() (data_juicer.ops.selector.rangespecifiedfieldselector method)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector.__init__"]], "__init__() (data_juicer.ops.selector.topkspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector.__init__"]], "data_juicer.ops.selector": [[10, "module-data_juicer.ops.selector"]], "process() (data_juicer.ops.selector.frequencyspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector.process"]], "process() (data_juicer.ops.selector.randomselector method)": [[10, "data_juicer.ops.selector.RandomSelector.process"]], "process() (data_juicer.ops.selector.rangespecifiedfieldselector method)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector.process"]], "process() (data_juicer.ops.selector.topkspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector.process"]], "data_juicer.tools": [[11, "module-data_juicer.tools"]], "data_juicer.utils": [[12, "module-data_juicer.utils"]]}}) \ No newline at end of file +Search.setIndex({"docnames": ["data_juicer", "data_juicer.analysis", "data_juicer.config", "data_juicer.core", "data_juicer.format", "data_juicer.ops", "data_juicer.ops.common", "data_juicer.ops.deduplicator", "data_juicer.ops.filter", "data_juicer.ops.mapper", "data_juicer.ops.selector", "data_juicer.tools", "data_juicer.utils", "index", "modules"], "filenames": ["data_juicer.rst", "data_juicer.analysis.rst", "data_juicer.config.rst", "data_juicer.core.rst", "data_juicer.format.rst", "data_juicer.ops.rst", "data_juicer.ops.common.rst", "data_juicer.ops.deduplicator.rst", "data_juicer.ops.filter.rst", "data_juicer.ops.mapper.rst", "data_juicer.ops.selector.rst", "data_juicer.tools.rst", "data_juicer.utils.rst", "index.rst", "modules.rst"], "titles": ["data_juicer", "data_juicer.analysis", "data_juicer.config", "data_juicer.core", "data_juicer.format", "data_juicer.ops", "data_juicer.ops.common", "data_juicer.ops.deduplicator", "data_juicer.ops.filter", "data_juicer.ops.mapper", "data_juicer.ops.selector", "data_juicer.tools", "data_juicer.utils", "Welcome to data-juicer\u2019s documentation!", "data_juicer"], "terms": {"cuda_device_count": [0, 14], "sourc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "is_cuda_avail": [0, 14], "class": [1, 3, 4, 5, 7, 8, 9, 10], "columnwiseanalysi": [1, 3, 13], "dataset": [1, 3, 4, 5, 7, 8, 9, 10], "output_path": 1, "overall_result": 1, "none": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "save_stats_in_one_fil": 1, "true": [1, 2, 3, 5, 6, 7, 8, 9, 10], "base": [1, 3, 4, 5, 7, 8, 9, 10], "object": [1, 2, 3, 8], "appli": [1, 3, 7, 9, 10], "each": [1, 3, 5, 7, 9], "column": [1, 3, 9], "stat": [1, 3, 5, 7, 8], "respect": [1, 9], "__init__": [1, 3, 4, 5, 7, 8, 9, 10], "initi": [1, 2, 3, 4, 7, 8, 9, 10], "method": [1, 3, 4, 6, 7, 8, 9, 10], "paramet": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "analyz": [1, 2, 3, 13], "path": [1, 2, 3, 4, 7, 8, 9], "store": [1, 3, 4, 5, 7, 8, 9], "result": [1, 3, 8], "option": [1, 3, 4, 9], "precomput": 1, "overal": 1, "whether": [1, 2, 3, 4, 5, 6, 7, 8, 9], "save": [1, 2, 3], "all": [1, 3, 6, 8, 9], "figur": [1, 3, 9], "one": [1, 2, 6, 7, 8, 9], "imag": [1, 5, 7, 8, 9], "file": [1, 2, 3, 4, 5, 8, 9], "show_percentil": 1, "fals": [1, 2, 3, 4, 5, 6, 7, 8, 9], "show": [1, 3, 9], "skip_export": [1, 3], "draw": 1, "percentil": [1, 10], "line": [1, 2, 8, 9], "sub": [1, 6, 7], "If": [1, 3, 7, 8, 9], "": [1, 3, 7, 8, 9], "sever": [1, 3, 9], "red": 1, "indic": [1, 9], "quantil": 1, "distribut": [1, 3, 9], "singl": [1, 3, 9], "window": [1, 7], "after": [1, 3, 6, 7, 8, 9], "disk": [1, 3], "return": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "draw_hist": 1, "ax": 1, "data": [1, 3, 4, 5, 8, 9], "save_path": 1, "histogram": 1, "includ": [1, 3, 7, 8, 9], "inform": [1, 3, 5, 7, 8, 9, 10], "draw_box": 1, "box": [1, 9], "plot": 1, "diversityanalysi": [1, 13], "lang_or_model": 1, "en": [1, 6, 8, 9], "divers": [1, 9], "sampl": [1, 3, 4, 5, 7, 8, 9, 10], "get": [1, 6], "an": [1, 3, 4, 5, 7, 8, 9], "param": [1, 2, 4, 6, 7, 9], "model": [1, 6, 7, 8, 9, 13], "specif": [1, 3, 5, 7, 8, 9], "languag": [1, 7, 8, 9], "us": [1, 2, 3, 4, 5, 6, 7, 8, 9, 13], "load": [1, 3, 4, 5, 9], "comput": [1, 3, 5, 6, 7, 8], "column_nam": 1, "text": [1, 4, 5, 6, 7, 8, 9], "lexic": 1, "tree": [1, 8], "name": [1, 3, 4, 5, 8, 9], "postproc_func": 1, "function": [1, 6, 7], "get_divers": 1, "postproc_kwarg": 1, "whole": [1, 8, 9], "In": [1, 3], "default": [1, 2, 3, 4, 7, 8, 9], "argument": [1, 3, 5, 8, 9], "overallanalysi": [1, 3, 13], "mean": [1, 3, 9], "std": 1, "etc": [1, 3, 4], "refine_single_column": 1, "col": 1, "num_proc": [1, 3, 4], "1": [1, 3, 4, 8, 9], "describ": 1, "panda": 1, "number": [1, 3, 4, 5, 7, 8, 9, 10], "process": [1, 3, 4, 5, 6, 7, 8, 9, 10, 13], "export": [1, 3, 4, 5, 13], "init_config": [2, 13], "arg": [2, 3, 4, 5, 7, 8, 9, 10], "list": [2, 3, 4, 5, 6, 8, 9], "str": [2, 3, 4, 6, 7, 8, 9, 10], "jsonargpars": [2, 3], "parser": 2, "pars": [2, 9], "from": [2, 3, 4, 5, 6, 7, 8, 9, 10], "posix": 2, "style": 2, "command": [2, 4, 9], "yaml": [2, 9], "json": [2, 3, 4, 8], "jsonnet": 2, "superset": 2, "environ": [2, 3], "variabl": 2, "hard": 2, "code": [2, 9], "e": [2, 3, 4, 8, 9], "g": [2, 3, 4, 9], "conifg": 2, "cfg": [2, 3, 4], "defaut": 2, "global": [2, 4, 9], "executor": [2, 3, 13], "get_init_config": [2, 13], "namespac": [2, 3], "dict": [2, 3, 9], "set": [2, 3, 6, 8, 9, 10], "init": 2, "datajuc": 2, "export_config": [2, 13], "format": [2, 3, 8, 9, 13], "skip_non": 2, "bool": [2, 3, 7, 8, 9, 10], "skip_check": 2, "overwrit": [2, 9], "multifil": 2, "some": [2, 9], "ar": [2, 3, 6, 7, 8, 9, 10], "type": [2, 3, 4, 9], "json_ind": 2, "parser_mod": 2, "exclud": 2, "entri": 2, "whose": [2, 8, 9], "valu": [2, 3, 5, 7, 8, 9, 10], "i": [2, 3, 4, 5, 6, 7, 8, 9], "skip": [2, 3], "check": [2, 9], "exist": 2, "multipl": [2, 3, 4, 6, 7, 8], "__path__": 2, "meta": [2, 4], "merge_config": [2, 13], "ori_cfg": 2, "new_cfg": 2, "merg": [2, 4, 6, 8], "configur": [2, 3, 4, 9], "origin": [2, 3, 8, 9], "expect": [2, 3, 9], "cfg_after_merg": 2, "prepare_side_config": [2, 13], "ori_config": 2, "string": [2, 7, 8, 9], "yml": 2, "adapt": [3, 13], "max_batch_s": 3, "10000": 3, "static": 3, "execute_and_prob": 3, "oper": [3, 8], "sample_interv": 3, "0": [3, 4, 5, 7, 8, 9], "5": [3, 7, 8, 9], "input": [3, 5, 7, 8, 9, 10], "probe": 3, "relat": [3, 8, 9], "op": [3, 13], "specifi": [3, 4, 6, 8, 9, 10], "For": [3, 5, 7, 8, 9], "now": [3, 6, 9], "we": [3, 4, 7, 8, 9, 13], "support": [3, 8, 9], "follow": [3, 9], "target": [3, 8, 9, 10], "resourc": 3, "util": [3, 8], "speed": 3, "averag": [3, 8], "The": [3, 4, 5, 8, 9, 10], "item": [3, 5, 9], "take_batch": 3, "config": [3, 5, 9, 13], "split": [3, 6, 9], "batch": [3, 9], "factor": 3, "size": [3, 6, 7, 8, 9], "iter": [3, 8, 9], "adapt_workload": 3, "manag": [3, 9], "schedul": 3, "balanc": 3, "need": [3, 6, 8, 9, 10], "recip": 3, "probe_small_batch": 3, "perform": 3, "small": [3, 8, 9], "pre": [3, 9], "execut": 3, "avail": [3, 8], "current": 3, "estim": 3, "rank": [3, 8, 9, 10], "notic": [3, 9], "should": [3, 7, 8, 9], "run": [3, 5, 8, 9], "cach": [3, 8], "enabl": [3, 9], "A": [3, 5, 7, 9], "length": [3, 4, 8, 9], "batch_size_strategi": 3, "load_analysis_r": 3, "base_b": 3, "util_th": 3, "9": [3, 8, 9], "decid": [3, 5, 7, 8], "accord": [3, 4, 5, 8, 9], "workload": 3, "analysi": [3, 13], "threshold": [3, 7, 8, 9], "guarante": 3, "won": [3, 7], "t": [3, 4, 6, 7], "exce": [3, 8, 9], "onli": [3, 7, 8, 9], "consid": [3, 7, 8, 9], "bucket": 3, "effect": 3, "which": [3, 5, 7, 8, 9], "max": [3, 4, 7, 8, 9], "except": [3, 9], "gpu": 3, "thi": [3, 4, 5, 6, 7, 8, 9, 10], "It": [3, 4, 7, 8, 9], "filter": [3, 5, 7, 9, 13], "gener": [3, 9], "tabl": [3, 9], "help": 3, "user": 3, "understand": 3, "better": [3, 8], "load_data_np": 3, "int": [3, 4, 7, 8, 9, 10], "skip_return": 3, "pipelin": 3, "worker": 3, "when": [3, 4, 5, 7, 8, 9, 10], "api": [3, 9], "call": [3, 9], "nesteddataset": [3, 13], "karg": 3, "djdataset": 3, "enhanc": 3, "huggingfac": [3, 4, 8, 9], "usabl": 3, "effici": 3, "work_dir": 3, "checkpoint": 3, "tracer": [3, 5, 7, 13], "map": [3, 9], "overrid": 3, "func": 3, "most": [3, 9], "common": [3, 13], "can": [3, 8, 9], "access": 3, "nest": 3, "manner": 3, "select": [3, 4, 5, 8, 9, 10], "classmethod": [3, 4], "from_dict": 3, "from_xx": 3, "constructor": 3, "construct": [3, 9], "add_column": 3, "add": [3, 4, 9], "select_column": 3, "remove_column": 3, "remov": [3, 5, 6, 8, 9], "cleanup_cache_fil": 3, "clear": 3, "raw": [3, 9], "compress": 3, "load_from_disk": 3, "wa": [3, 9], "previous": 3, "save_to_disk": 3, "directori": [3, 4, 8], "filesystem": 3, "ani": [3, 6, 8, 9], "implement": [3, 7], "fsspec": 3, "spec": 3, "abstractfilesystem": 3, "dataset_path": [3, 4], "like": [3, 6, 7, 8, 9], "train": [3, 9], "remot": [3, 9], "uri": 3, "s3": 3, "my": 3, "where": 3, "keep_in_memori": 3, "copi": 3, "memori": 3, "unless": 3, "explicitli": 3, "in_memory_max_s": 3, "nonzero": 3, "see": [3, 13], "more": [3, 8, 9, 13], "detail": [3, 8, 9, 13], "improv": 3, "section": 3, "storage_opt": 3, "kei": [3, 4, 5, 8, 9, 10], "pair": [3, 5, 7, 8, 9], "pass": [3, 9], "system": [3, 9], "backend": 3, "ad": [3, 6, 9], "version": [3, 9], "2": [3, 6, 8, 9], "8": [3, 8, 9], "request": [3, 9], "datasetdict": 3, "exampl": [3, 8, 9], "py": [3, 4], "d": [3, 4, 9], "unifi": [3, 4], "order": [3, 9, 10], "sample_data": 3, "dataset_to_sampl": 3, "sample_ratio": 3, "float": [3, 7, 8, 9, 10], "sample_algo": 3, "uniform": [3, 8, 9], "kwarg": [3, 4, 5, 7, 8, 9, 10], "subset": [3, 4], "given": [3, 8, 9], "formatt": [3, 4], "link": [3, 9], "ratio": [3, 4, 6, 8, 9, 10], "algorithm": [3, 7, 9], "frequency_specified_field_selector": 3, "topk_specified_field_selector": 3, "export_path": 3, "export_shard_s": 3, "export_in_parallel": 3, "export_d": 3, "keep_stats_in_res_d": 3, "keep_hashes_in_res_d": 3, "export_stat": 3, "kib": 3, "1024": 3, "mib": 3, "1048576": 3, "gib": 3, "1073741824": 3, "tib": 3, "1099511627776": 3, "shard": 3, "content": [3, 9], "keep": [3, 5, 7, 8, 9], "hash": [3, 5, 7], "export_compute_stat": 3, "statu": 3, "to_jsonl": 3, "jsonl": [3, 4], "extra": [3, 4, 7, 8, 9, 10], "to_json": 3, "to_parquet": 3, "parquet": [3, 4], "monitor": [3, 13], "other": [3, 8, 9], "dure": 3, "python": 3, "time": [3, 9], "10": [3, 8, 9], "interv": 3, "timestamp": 3, "xxx": 3, "cpu": 3, "count": [3, 8], "free": 3, "mem": 3, "structur": 3, "abov": [3, 9], "field": [3, 4, 5, 7, 8, 9, 10], "first": [3, 6, 7, 8, 9], "level": [3, 5, 6, 7, 8, 9, 10], "resource_analysi": 3, "min": [3, 7, 8, 9], "avg": [3, 8], "those": [3, 8, 9], "dynamic_field": 3, "monitor_all_resourc": 3, "detect": [3, 7, 8, 9], "node": 3, "monitor_current_resourc": 3, "machin": 3, "rang": [3, 8, 9, 10], "mb": [3, 8], "draw_resource_util_graph": 3, "resource_util_list": 3, "store_dir": 3, "analyze_resource_util_list": 3, "metric": [3, 5, 7, 8], "analyze_single_resource_util": 3, "resource_util_dict": 3, "monitor_func": 3, "show_num": [3, 5, 7], "trace": [3, 5, 7], "chang": [3, 9], "befor": [3, 8], "comparison": 3, "work": [3, 8, 9], "maximum": [3, 8, 9], "trace_mapp": 3, "op_nam": 3, "previous_d": 3, "processed_d": 3, "text_kei": [3, 4, 5], "compar": 3, "mapper": [3, 5, 13], "mainli": 3, "differ": [3, 4, 6, 7, 8, 9], "due": 3, "modif": 3, "trace_batch_mapp": 3, "batchmapp": 3, "new": [3, 4, 9], "augment": [3, 6, 8, 9], "trace_filt": 3, "trace_dedupl": 3, "dup_pair": 3, "dedupl": [3, 5, 9, 13], "duplic": [3, 5, 7], "extract": [3, 8, 9], "two": [3, 7, 8, 9], "embed": 3, "independ": [3, 8, 9], "obtain": [3, 6], "load_formatt": [4, 13], "generated_dataset_config": [4, 9], "suffix": [4, 8], "add_suffix": 4, "baseformatt": 4, "mixtur": 4, "weight": [4, 7, 9], "creat": 4, "provid": [4, 7, 9], "must": [4, 8, 9], "contain": [4, 6, 8, 9], "info": [4, 5], "jsonformatt": [4, 13], "localformatt": [4, 13], "zst": 4, "local": 4, "packag": 4, "modul": [4, 13], "csv": 4, "load_dataset": 4, "global_cfg": 4, "its": [4, 5, 7, 9], "consequ": 4, "remoteformatt": [4, 13], "repositori": 4, "hub": 4, "textformatt": [4, 13], "txt": [4, 8], "pdf": [4, 8], "cpp": 4, "docx": [4, 8], "md": 4, "tex": [4, 9], "asm": 4, "bat": 4, "cmd": 4, "c": 4, "h": [4, 8, 9], "hpp": 4, "cc": 4, "hh": 4, "cmake": 4, "css": 4, "dockerfil": 4, "f90": 4, "f": 4, "f03": 4, "f08": 4, "f77": 4, "f95": 4, "fpp": 4, "go": 4, "html": [4, 8, 9], "java": 4, "j": 4, "jl": 4, "lua": 4, "markdown": 4, "php": 4, "php3": 4, "php4": 4, "php5": 4, "phpt": 4, "pl": 4, "pm": 4, "pod": 4, "perl": 4, "ps1": 4, "psd1": 4, "psm1": 4, "rb": 4, "r": 4, "sql": 4, "scala": 4, "sh": 4, "bash": 4, "zsh": 4, "tsx": 4, "vb": 4, "makefil": 4, "xml": [4, 8, 9], "rst": 4, "m": [4, 9], "smali": 4, "datas": 4, "unified_format_dataset": 4, "parquetformatt": [4, 13], "csvformatt": [4, 13], "tsvformatt": [4, 13], "tsv": 4, "delimit": [4, 9], "mixtureformatt": [4, 13], "max_sampl": 4, "mix": [4, 9], "randomli": [4, 9], "everi": 4, "them": [4, 7, 8, 9], "datasset": 4, "dir": 4, "w1": 4, "w2": 4, "ds_dir": 4, "w3": 4, "ds_file": 4, "random_sampl": 4, "sample_numb": 4, "seed": [4, 9], "bigger": [4, 9], "than": [4, 6, 7, 8, 9, 10], "instead": [4, 6], "random": [4, 9, 10], "42": 4, "emptyformatt": [4, 9, 13], "feature_kei": [4, 9], "empti": [4, 7, 9], "featur": 4, "properti": 4, "null_valu": 4, "rayemptyformatt": [4, 9, 13], "rai": [4, 7, 9], "load_op": [5, 13], "process_list": 5, "instanc": 5, "image_kei": 5, "audio_kei": 5, "audio": [5, 8, 9], "video_kei": [5, 9], "video": [5, 7, 8, 9], "compute_stats_batch": [5, 8], "process_batch": [5, 8, 9], "compute_stats_singl": [5, 7, 8], "context": [5, 7, 8, 9], "intermedi": [5, 7, 8], "var": [5, 7, 8], "temporarili": [5, 7, 8], "process_singl": [5, 7, 8, 9], "boolean": [5, 7, 8], "reduc": [5, 8, 9], "conduct": 5, "edit": 5, "compute_hash": [5, 7], "doc": [5, 7], "open": [5, 7, 9], "selector": [5, 13], "get_sentences_from_docu": [6, 13], "document": [6, 7, 8, 9], "model_func": 6, "sentenc": [6, 9], "splite": 6, "separ": [6, 8, 9, 10], "n": [6, 8, 9], "get_words_from_docu": [6, 13], "token_func": 6, "new_lin": 6, "tab": 6, "word": [6, 8, 9], "stopword": [6, 8], "token": [6, 7, 8, 9], "merge_on_whitespace_tab_newlin": [6, 13], "invert": 6, "split_on_newline_tab_whitespac": [6, 13], "concaten": [6, 9], "split_on_whitespac": [6, 13], "also": 6, "space": [6, 7], "tag": [6, 8, 9], "strip": [6, 13], "strip_charact": 6, "wai": [6, 9], "faster": 6, "sinc": 6, "lot": 6, "element": 6, "emoji": 6, "charact": [6, 7, 8, 9], "words_augment": [6, 13], "group_siz": 6, "join_char": 6, "especi": [6, 8], "chines": [6, 7, 8, 9], "without": [6, 9], "between": [6, 7, 8, 9], "vietnames": [6, 8], "syllabl": 6, "group": [6, 8], "words_refin": [6, 13], "lower_cas": 6, "strip_char": 6, "use_words_aug": [6, 8], "words_aug_group_s": [6, 8], "words_aug_join_char": [6, 8], "refin": 6, "non": [6, 7, 9], "revers": [6, 10], "special": [6, 8, 9], "convert": [6, 7, 9], "lower": [6, 7, 8, 9, 10], "case": [6, 7, 8, 9, 13], "lowercas": [6, 7, 9], "char": [6, 8, 9], "split_text_by_punctu": [6, 13], "zh": [6, 8], "punctuat": [6, 7, 9], "documentdedupl": [7, 13], "ignore_non_charact": 7, "exact": 7, "match": [7, 8, 9], "md5": 7, "ignor": [7, 9], "alphabet": [7, 8, 9], "whitespac": [7, 9], "digit": 7, "documentminhashdedupl": [7, 13], "window_s": 7, "ignore_pattern": 7, "num_permut": 7, "256": 7, "jaccard_threshold": 7, "7": [7, 9], "num_band": 7, "num_rows_per_band": 7, "tokenizer_model": 7, "minhashlsh": 7, "simhash": 7, "minhash": 7, "byte": [7, 8], "so": [7, 8, 9], "thei": [7, 9], "kept": [7, 8, 9], "final": [7, 9], "sentencepiec": 7, "english": [7, 8, 9], "recommend": [7, 9], "pleas": [7, 9], "shingl": 7, "pattern": [7, 9], "permut": 7, "jaccard": 7, "similar": [7, 8, 9], "regard": [7, 9], "band": 7, "lsh": 7, "determin": [7, 9, 10], "optim": [7, 9], "minim": 7, "sum": 7, "prob": 7, "posit": [7, 8, 9], "neg": [7, 9], "row": 7, "documentsimhashdedupl": [7, 13], "6": [7, 8, 9], "num_block": 7, "hamming_dist": 7, "4": [7, 8, 9], "And": [7, 9], "block": 7, "ham": 7, "distanc": 7, "alwai": 7, "less": [7, 8, 9, 10], "imagededupl": [7, 13], "phash": 7, "consider_text": 7, "togeth": [7, 9], "raybasicdedupl": [7, 13], "redis_host": 7, "localhost": 7, "redis_port": 7, "6380": 7, "basic": 7, "although": 7, "empty_hash_valu": 7, "hostnam": 7, "redi": 7, "server": 7, "port": 7, "calculate_hash": 7, "calcul": [7, 8, 9], "raydocumentdedupl": [7, 13], "rayimagededupl": [7, 13], "rayvideodedupl": [7, 13], "videodedupl": [7, 13], "alphanumericfilt": [8, 13], "min_ratio": [8, 9], "25": 8, "max_ratio": [8, 9], "9223372036854775807": [8, 9], "numer": [8, 9], "within": [8, 9, 10], "alphanumer": 8, "total": [8, 9], "below": [8, 9], "audiodurationfilt": [8, 13], "min_dur": 8, "max_dur": 8, "any_or_al": [8, 9], "durat": [8, 9], "second": [8, 9], "sy": 8, "maxsiz": 8, "strategi": [8, 9], "meet": [8, 9], "condit": [8, 9], "audionmfsnrfilt": [8, 13], "min_snr": 8, "max_snr": 8, "nmf_iter_num": 8, "500": [8, 9], "snr": 8, "nmf": 8, "db": 8, "audiosizefilt": [8, 13], "min_siz": 8, "max_siz": 8, "1tb": 8, "kb": 8, "constraint": 8, "approxim": 8, "un": 8, "limit": 8, "averagelinelengthfilt": [8, 13], "min_len": [8, 9], "max_len": [8, 9], "characterrepetitionfilt": [8, 13], "rep_len": 8, "gram": 8, "repetit": 8, "flaggedwordfilt": [8, 13], "lang": [8, 9], "045": 8, "flagged_words_dir": 8, "home": 8, "runner": 8, "asset": 8, "flag": 8, "what": [8, 9], "adopt": 8, "flagged_word": 8, "join": 8, "imageaestheticsfilt": [8, 13], "hf_scorer_model": 8, "trust_remote_cod": [8, 9], "min_scor": 8, "max_scor": 8, "aesthet": 8, "score": [8, 9], "predictor": 8, "By": [8, 9], "shunk031": 8, "v2": 8, "sac": 8, "logo": 8, "ava1": 8, "l14": 8, "linearms": 8, "refer": [8, 9], "pypi": 8, "org": [8, 9], "project": 8, "simpl": [8, 9], "predict": 8, "keyword": [8, 9], "imageaspectratiofilt": [8, 13], "333": 8, "3": [8, 9], "aspect": [8, 9], "aspectratio": [8, 9], "w": [8, 9], "imagefacecountfilt": [8, 13], "cv_classifi": [8, 9], "min_face_count": 8, "max_face_count": 8, "face": [8, 9], "opencv": [8, 9], "classifi": [8, 9], "haarcascade_frontalface_alt": [8, 9], "minimum": [8, 9], "requir": 8, "imagefaceratiofilt": [8, 13], "area": 8, "largest": [8, 10], "imagensfwfilt": [8, 13], "hf_nsfw_model": 8, "falconsai": 8, "nsfw_image_detect": 8, "score_threshold": 8, "have": [8, 9], "low": 8, "nsfw": 8, "imagepairsimilarityfilt": [8, 13], "hf_clip": 8, "openai": 8, "clip": [8, 9], "vit": 8, "patch32": 8, "closedunitinterv": 8, "imageshapefilt": [8, 13], "min_width": [8, 9], "max_width": [8, 9], "min_height": [8, 9], "max_height": [8, 9], "shape": 8, "width": [8, 9], "height": [8, 9], "imagesizefilt": [8, 13], "imagetextmatchingfilt": [8, 13], "hf_blip": 8, "salesforc": [8, 9], "blip": [8, 9], "itm": 8, "coco": 8, "003": 8, "horizontal_flip": [8, 9], "vertical_flip": [8, 9], "reduce_mod": 8, "flip": [8, 9], "horizont": [8, 9], "left": [8, 9], "right": [8, 9], "vertic": [8, 9], "top": [8, 9, 10], "bottom": [8, 9], "mode": [8, 9], "correspond": [8, 9, 10], "chunk": [8, 9], "take": 8, "imagetextsimilarityfilt": [8, 13], "imagewatermarkfilt": [8, 13], "hf_watermark_model": 8, "amrul": 8, "hzz": 8, "watermark_detector": 8, "prob_threshold": 8, "watermark": [8, 9], "high": [8, 9], "probabl": [8, 9], "languageidscorefilt": [8, 13], "confid": 8, "larger": [8, 9, 10], "identif": 8, "maximumlinelengthfilt": [8, 13], "perplexityfilt": [8, 13], "max_ppl": 8, "1500": 8, "perplex": 8, "phrasegroundingrecallfilt": [8, 13], "hf_owlvit": 8, "googl": 8, "owlvit": 8, "min_recal": 8, "max_recal": 8, "iou_thr": 8, "large_area_ratio_thr": 8, "95": [8, 9], "conf_thr": 8, "locat": [8, 9], "recal": 8, "phrase": 8, "owl": 8, "ground": 8, "iou": 8, "nm": 8, "post": 8, "bbox": 8, "overlap": [8, 9], "out": 8, "larg": 8, "account": 8, "specialcharactersfilt": [8, 13], "specifiedfieldfilt": [8, 13], "field_kei": [8, 10], "target_valu": 8, "multi": [8, 9, 10, 13], "retain": [8, 9], "specifiednumericfieldfilt": [8, 13], "min_valu": 8, "max_valu": 8, "specifiednumericfield": 8, "stopwordsfilt": [8, 13], "stopwords_dir": 8, "suffixfilt": [8, 13], "textactionfilt": [8, 13], "min_action_num": 8, "action": [8, 9], "mini_action_num": 8, "textentitydependencyfilt": [8, 13], "min_dependency_num": 8, "identifi": [8, 9], "entiti": [8, 9], "omit": 8, "mini_dependency_num": 8, "edg": [8, 9], "depend": [8, 9], "objet": 8, "textlengthfilt": [8, 13], "tokennumfilt": [8, 13], "hf_token": 8, "eleutherai": 8, "pythia": 8, "9b": 8, "dedup": 8, "min_num": 8, "max_num": 8, "hug": [8, 9], "videoaestheticsfilt": [8, 13], "frame_sampling_method": [8, 9], "frame_num": [8, 9], "frame": [8, 9], "all_keyfram": [8, 9], "former": [8, 9], "latter": [8, 9], "uniformli": [8, 9], "keyfram": 8, "while": 8, "usual": 8, "term": 8, "middl": [8, 9], "last": [8, 9], "addit": [8, 9], "videoaspectratiofilt": [8, 13], "21": [8, 9], "videodurationfilt": [8, 13], "videoframestextsimilarityfilt": [8, 13], "kind": [8, 9], "chineseclip": 8, "might": [8, 9], "choic": [8, 9], "videomotionscorefilt": [8, 13], "7976931348623157e": 8, "308": 8, "sampling_fp": 8, "tupl": 8, "divis": [8, 9], "rel": 8, "motion": 8, "farneback": 8, "algorith": 8, "dens": 8, "optic": 8, "flow": 8, "rate": 8, "frames_per_second": 8, "resiz": [8, 9], "sequenc": [8, 9], "smaller": [8, 9, 10], "rescal": 8, "allow": [8, 9], "longer": 8, "greater": [8, 9, 10], "being": [8, 9], "overrul": 8, "equal": [8, 9, 10], "As": 8, "mai": [8, 9], "shorter": [8, 9], "dimens": [8, 9], "magnitud": 8, "normal": [8, 9], "diagon": 8, "setup_model": 8, "compute_flow": 8, "prev_fram": 8, "curr_fram": 8, "videomotionscoreraftfilt": [8, 13], "raft": 8, "recurr": 8, "transform": [8, 9], "torchvis": 8, "further": 8, "offici": 8, "http": [8, 9], "pytorch": 8, "vision": [8, 9], "main": [8, 9], "paper": 8, "here": [8, 9, 13], "arxiv": 8, "ab": 8, "2003": 8, "12039": 8, "videonsfwfilt": [8, 13], "videoocrarearatiofilt": [8, 13], "min_area_ratio": 8, "max_area_ratio": 8, "frame_sample_num": 8, "languages_to_detect": 8, "ch_sim": 8, "ocr": [8, 9], "evenli": 8, "full": [8, 9], "found": [8, 9], "www": 8, "jaid": 8, "ai": [8, 9], "easyocr": 8, "get_read": 8, "videoresolutionfilt": [8, 13], "resolut": [8, 9], "videotaggingfromframesfilt": [8, 13], "peopl": 8, "tag_field_nam": [8, 9], "__dj__video_frame_tags__": [8, 9], "shift": [8, 9], "github": 8, "com": 8, "xinyu1205": 8, "recogn": 8, "anyth": 8, "blob": 8, "ram": 8, "ram_tag_list": 8, "noqa": 8, "e501": 8, "videowatermarkfilt": [8, 13], "wordrepetitionfilt": [8, 13], "wordsnumfilt": [8, 13], "audioffmpegwrappedmapp": [9, 13], "filter_nam": 9, "filter_kwarg": 9, "global_arg": 9, "capture_stderr": 9, "overwrite_output": 9, "wrapper": 9, "ffmpeg": 9, "captur": 9, "stderr": 9, "output": 9, "calibrateqamapp": [9, 13], "api_model": 9, "gpt": 9, "4o": 9, "api_endpoint": 9, "response_path": 9, "system_prompt": 9, "input_templ": 9, "reference_templ": 9, "qa_pair_templ": 9, "output_pattern": 9, "try_num": 9, "model_param": 9, "sampling_param": 9, "calibr": 9, "question": 9, "answer": 9, "default_system_prompt": 9, "\u8bf7\u6839\u636e\u63d0\u4f9b\u7684": 9, "\u53c2\u8003\u4fe1\u606f": 9, "\u5bf9": 9, "\u95ee\u9898": 9, "\u548c": 9, "\u56de\u7b54": 9, "\u8fdb\u884c\u6821\u51c6": 9, "\u4f7f\u5176\u66f4\u52a0\u8be6\u7ec6": 9, "\u51c6\u786e": 9, "n\u6309\u7167\u4ee5\u4e0b\u683c\u5f0f\u8f93\u51fa": 9, "n\u6821\u51c6\u540e\u7684\u95ee\u9898": 9, "n\u6821\u51c6\u540e\u7684\u56de\u7b54": 9, "default_input_templ": 9, "qa_pair": 9, "default_reference_templ": 9, "default_qa_pair_templ": 9, "default_output_pattern": 9, "url": 9, "endpoint": 9, "respons": 9, "messag": 9, "prompt": 9, "task": 9, "templat": 9, "build": 9, "regular": 9, "express": 9, "temperatur": 9, "top_p": 9, "build_input": 9, "parse_output": 9, "raw_output": 9, "calibratequerymapp": [9, 13], "queri": 9, "\u5bf9\u95ee\u7b54\u5bf9\u4e2d\u7684": 9, "\u4e14\u4ecd\u53ef\u4ee5\u7531\u539f\u7b54\u6848\u56de\u7b54": 9, "\u53ea\u8f93\u51fa\u6821\u51c6\u540e\u7684\u95ee\u9898": 9, "\u4e0d\u8981\u8f93\u51fa\u591a\u4f59\u5185\u5bb9": 9, "calibrateresponsemapp": [9, 13], "\u4e14\u4ecd\u53ef\u4ee5\u56de\u7b54\u539f\u95ee\u9898": 9, "\u53ea\u8f93\u51fa\u6821\u51c6\u540e\u7684\u56de\u7b54": 9, "chineseconvertmapp": [9, 13], "s2t": 9, "tradit": 9, "simplifi": 9, "japanes": 9, "kanji": 9, "choos": 9, "t2": 9, "s2tw": 9, "taiwan": 9, "standard": 9, "tw2": 9, "s2hk": 9, "hong": 9, "kong": 9, "variant": 9, "hk2": 9, "s2twp": 9, "taiwanes": 9, "idiom": 9, "tw2sp": 9, "mainland": 9, "t2tw": 9, "tw2t": 9, "hk2t": 9, "t2hk": 9, "t2jp": 9, "ky\u016bjitai": 9, "jp2t": 9, "shinjitai": 9, "cleancopyrightmapp": [9, 13], "clean": 9, "copyright": 9, "comment": 9, "begin": 9, "cleanemailmapp": [9, 13], "repl": 9, "email": 9, "search": [9, 13], "replac": 9, "cleanhtmlmapp": [9, 13], "cleanipmapp": [9, 13], "ipv4": 9, "ipv6": 9, "address": 9, "cleanlinksmapp": [9, 13], "ftp": 9, "expandmacromapp": [9, 13], "expand": 9, "macro": 9, "definit": 9, "bodi": 9, "latex": 9, "extractentityattributemapp": [9, 13], "query_ent": 9, "query_attribut": 9, "entity_kei": 9, "__dj__main_entity__": 9, "attribute_kei": 9, "__dj__attribute__": 9, "attribute_desc_kei": 9, "__dj__attribute_description__": 9, "support_text_kei": 9, "__dj__attribute_support_text__": 9, "system_prompt_templ": 9, "attr_pattern_templ": 9, "demo_pattern": 9, "drop_text": 9, "attribut": 9, "default_system_prompt_templ": 9, "\u7ed9\u5b9a\u4e00\u6bb5\u6587\u672c": 9, "\u4ece\u6587\u672c\u4e2d\u603b\u7ed3": 9, "\u7684": 9, "\u5e76\u4e14\u4ece\u539f\u6587\u6458\u5f55\u6700\u80fd\u8bf4\u660e\u8be5": 9, "\u7684\u4ee3\u8868\u6027\u793a\u4f8b": 9, "n\u8981\u6c42": 9, "\u6458\u5f55\u7684\u793a\u4f8b\u5e94\u8be5\u7b80\u77ed": 9, "\u9075\u5faa\u5982\u4e0b\u7684\u56de\u590d\u683c\u5f0f": 9, "\u63cf\u8ff0": 9, "\u4ee3\u8868\u6027\u793a\u4f8b1": 9, "n\u8bf4\u660e": 9, "\u8be5": 9, "\u7684\u539f\u6587\u6458\u5f551": 9, "\u4ee3\u8868\u6027\u793a\u4f8b2": 9, "\u7684\u539f\u6587\u6458\u5f552": 9, "\u6587\u672c": 9, "default_attr_pattern_templ": 9, "z": 9, "default_demon_pattern": 9, "\u4ee3\u8868\u6027\u793a\u4f8b": 9, "__dj__entity__": 9, "entity_attribute_kei": 9, "descript": 9, "__dj__support_text__": 9, "retri": 9, "attempt": 9, "error": 9, "drop": 9, "demonstract": 9, "attribute_nam": 9, "extractentityrelationmapp": [9, 13], "entity_typ": 9, "relation_kei": 9, "__dj__relation__": 9, "prompt_templ": 9, "tuple_delimit": 9, "record_delimit": 9, "completion_delimit": 9, "max_glean": 9, "continue_prompt": 9, "if_loop_prompt": 9, "entity_pattern": 9, "relation_pattern": 9, "knowledg": 9, "graph": 9, "default_prompt_templ": 9, "goal": 9, "ngiven": 9, "potenti": 9, "relev": 9, "activ": 9, "relationship": 9, "among": 9, "step": 9, "n1": 9, "entity_nam": 9, "One": 9, "entity_descript": 9, "comprehens": 9, "nformat": 9, "n2": 9, "source_ent": 9, "target_ent": 9, "clearli": 9, "nfor": 9, "relationship_descript": 9, "explan": 9, "why": 9, "you": 9, "think": 9, "relationship_strength": 9, "strength": 9, "relationship_keyword": 9, "summar": 9, "overarch": 9, "natur": 9, "focus": 9, "concept": 9, "theme": 9, "rather": 9, "n3": 9, "n4": 9, "finish": 9, "nexampl": 9, "nentity_typ": 9, "person": 9, "technologi": 9, "mission": 9, "organ": 9, "ntext": 9, "nwhile": 9, "alex": 9, "clench": 9, "hi": 9, "jaw": 9, "buzz": 9, "frustrat": 9, "dull": 9, "against": 9, "backdrop": 9, "taylor": 9, "authoritarian": 9, "certainti": 9, "competit": 9, "undercurr": 9, "him": 9, "alert": 9, "sens": 9, "jordan": 9, "share": 9, "commit": 9, "discoveri": 9, "unspoken": 9, "rebellion": 9, "cruz": 9, "narrow": 9, "control": 9, "nthen": 9, "did": 9, "someth": 9, "unexpect": 9, "paus": 9, "besid": 9, "moment": 9, "observ": 9, "devic": 9, "akin": 9, "rever": 9, "tech": 9, "understood": 9, "said": 9, "voic": 9, "quieter": 9, "could": 9, "game": 9, "u": 9, "nthe": 9, "underli": 9, "dismiss": 9, "earlier": 9, "seem": 9, "falter": 9, "glimps": 9, "reluct": 9, "graviti": 9, "lai": 9, "hand": 9, "look": 9, "up": 9, "fleet": 9, "heartbeat": 9, "ey": 9, "lock": 9, "wordless": 9, "clash": 9, "wills": 9, "soften": 9, "uneasi": 9, "truce": 9, "nit": 9, "bare": 9, "percept": 9, "note": 9, "inward": 9, "nod": 9, "had": 9, "been": 9, "brought": 9, "noutput": 9, "who": 9, "experi": 9, "dynam": 9, "portrai": 9, "toward": 9, "perspect": 9, "ha": 9, "signific": 9, "interact": 9, "associ": 9, "influenc": 9, "central": 9, "stori": 9, "implic": 9, "affect": 9, "attitud": 9, "power": 9, "contrast": 9, "directli": 9, "lead": 9, "mutual": 9, "conflict": 9, "ideolog": 9, "import": 9, "impact": 9, "technolog": 9, "\u4eba\u7269": 9, "\u6280\u672f": 9, "\u4efb\u52a1": 9, "\u7ec4\u7ec7": 9, "\u5730\u70b9": 9, "n\u4ed6\u4eec\u4e0d\u518d\u662f\u5355\u7eaf\u7684\u6267\u884c\u8005": 9, "\u4ed6\u4eec\u5df2\u6210\u4e3a\u67d0\u4e2a\u8d85\u8d8a\u661f\u8fb0\u4e0e\u6761\u7eb9\u7684\u9886\u57df\u7684\u4fe1\u606f\u5b88\u62a4\u8005": 9, "\u8fd9\u4e00\u4f7f\u547d\u7684\u63d0\u5347\u4e0d\u80fd\u88ab\u89c4\u5219\u548c\u65e2\u5b9a\u534f\u8bae\u6240\u675f\u7f1a": 9, "\u5b83\u9700\u8981\u4e00\u79cd\u65b0\u7684\u89c6\u89d2": 9, "\u4e00\u79cd\u65b0\u7684\u51b3\u5fc3": 9, "n\u968f\u7740\u4e0e\u534e\u76db\u987f\u7684\u901a\u8baf\u5728\u80cc\u666f\u4e2d\u55e1\u55e1\u4f5c\u54cd": 9, "\u5bf9\u8bdd\u4e2d\u7684\u7d27\u5f20\u60c5\u7eea\u901a\u8fc7\u561f\u561f\u58f0\u548c\u9759\u7535\u566a\u97f3\u8d2f\u7a7f\u59cb\u7ec8": 9, "\u56e2\u961f\u7ad9\u7acb\u7740": 9, "\u4e00\u80a1\u4e0d\u7965\u7684\u6c14\u606f\u7b3c\u7f69\u7740\u4ed6\u4eec": 9, "\u663e\u7136": 9, "\u4ed6\u4eec\u5728\u63a5\u4e0b\u6765\u51e0\u4e2a\u5c0f\u65f6\u5185\u505a\u51fa\u7684\u51b3\u5b9a\u53ef\u80fd\u4f1a\u91cd\u65b0\u5b9a\u4e49\u4eba\u7c7b\u5728\u5b87\u5b99\u4e2d\u7684\u4f4d\u7f6e": 9, "\u6216\u8005\u5c06\u4ed6\u4eec\u7f6e\u4e8e\u65e0\u77e5\u548c\u6f5c\u5728\u5371\u9669\u4e4b\u4e2d": 9, "n\u968f\u7740\u4e0e\u661f\u8fb0\u7684\u8054\u7cfb\u53d8\u5f97\u66f4\u52a0\u7262\u56fa": 9, "\u5c0f\u7ec4\u5f00\u59cb\u5904\u7406\u9010\u6e10\u6210\u5f62\u7684\u8b66\u544a": 9, "\u4ece\u88ab\u52a8\u63a5\u53d7\u8005\u8f6c\u53d8\u4e3a\u79ef\u6781\u53c2\u4e0e\u8005": 9, "\u6885\u745f\u540e\u6765\u7684\u76f4\u89c9\u5360\u636e\u4e86\u4e0a\u98ce": 9, "\u56e2\u961f\u7684\u4efb\u52a1\u5df2\u7ecf\u6f14\u53d8": 9, "\u4e0d\u518d\u4ec5\u4ec5\u662f\u89c2\u5bdf\u548c\u62a5\u544a": 9, "\u800c\u662f\u4e92\u52a8\u548c\u51c6\u5907": 9, "\u4e00\u573a\u8715\u53d8\u5df2\u7ecf\u5f00\u59cb": 9, "\u800c": 9, "\u675c\u5c14\u585e\u884c\u52a8": 9, "\u5219\u4ee5\u4ed6\u4eec\u5927\u80c6\u7684\u65b0\u9891\u7387\u9707\u52a8": 9, "\u8fd9\u79cd\u57fa\u8c03\u4e0d\u662f\u7531\u4e16\u4fd7\u8bbe\u5b9a\u7684": 9, "\u534e\u76db\u987f": 9, "\u534e\u76db\u987f\u662f\u6b63\u5728\u63a5\u6536\u901a\u8baf\u7684\u5730\u65b9": 9, "\u8868\u660e\u5176\u5728\u51b3\u7b56\u8fc7\u7a0b\u4e2d\u7684\u91cd\u8981\u6027": 9, "\u675c\u5c14\u585e\u884c\u52a8\u88ab\u63cf\u8ff0\u4e3a\u4e00\u9879\u5df2\u6f14\u53d8\u4e3a\u4e92\u52a8\u548c\u51c6\u5907\u7684\u4efb\u52a1": 9, "\u663e\u793a\u51fa\u76ee\u6807\u548c\u6d3b\u52a8\u7684\u91cd\u5927\u8f6c\u53d8": 9, "\u56e2\u961f": 9, "\u56e2\u961f\u88ab\u63cf\u7ed8\u6210\u4e00\u7fa4\u4ece\u88ab\u52a8\u89c2\u5bdf\u8005\u8f6c\u53d8\u4e3a\u79ef\u6781\u53c2\u4e0e\u8005\u7684\u4eba": 9, "\u5c55\u793a\u4e86\u4ed6\u4eec\u89d2\u8272\u7684\u52a8\u6001\u53d8\u5316": 9, "\u56e2\u961f\u6536\u5230\u6765\u81ea\u534e\u76db\u987f\u7684\u901a\u8baf": 9, "\u8fd9\u5f71\u54cd\u4e86\u4ed6\u4eec\u7684\u51b3\u7b56\u8fc7\u7a0b": 9, "\u51b3\u7b56": 9, "\u5916\u90e8\u5f71\u54cd": 9, "\u56e2\u961f\u76f4\u63a5\u53c2\u4e0e\u675c\u5c14\u585e\u884c\u52a8": 9, "\u6267\u884c\u5176\u6f14\u53d8\u540e\u7684\u76ee\u6807\u548c\u6d3b\u52a8": 9, "\u4efb\u52a1\u6f14\u53d8": 9, "\u79ef\u6781\u53c2\u4e0e": 9, "role": 9, "event": 9, "ntheir": 9, "slice": 9, "through": 9, "illus": 9, "intellig": 9, "liter": 9, "write": 9, "own": 9, "rule": [9, 10], "state": 9, "stoical": 9, "cast": 9, "watch": 9, "over": 9, "flurri": 9, "learn": 9, "commun": 9, "offer": 9, "sam": 9, "rivera": 9, "nearbi": 9, "interfac": 9, "youth": 9, "energi": 9, "bode": 9, "aw": 9, "anxieti": 9, "give": [9, 13], "talk": 9, "stranger": 9, "nalex": 9, "survei": 9, "team": 9, "studi": 9, "concentr": 9, "measur": 9, "trepid": 9, "well": 9, "our": 9, "contact": 9, "he": 9, "acknowledg": 9, "readi": 9, "whatev": 9, "back": 9, "ntogeth": 9, "stood": 9, "unknown": 9, "forg": 9, "human": 9, "heaven": 9, "ensu": 9, "silenc": 9, "palpabl": 9, "collect": 9, "introspect": 9, "about": 9, "grand": 9, "cosmic": 9, "plai": 9, "rewrit": 9, "histori": 9, "encrypt": 9, "dialogu": 9, "continu": 9, "unfold": 9, "intric": 9, "almost": 9, "uncanni": 9, "anticip": 9, "member": 9, "leader": 9, "abil": 9, "govern": 9, "challeng": 9, "capabl": 9, "taken": 9, "involv": 9, "make": 9, "leadership": 9, "explor": 9, "autonomi": 9, "real": 9, "input_text": 9, "default_continue_prompt": 9, "mani": 9, "were": 9, "miss": 9, "same": 9, "default_if_loop_prompt": 9, "appear": 9, "still": 9, "ye": 9, "NO": 9, "default_entity_typ": 9, "geo": 9, "default_tuple_delimit": 9, "default_record_delimit": 9, "default_completion_delimit": 9, "complet": 9, "default_entity_pattern": 9, "default_relation_pattern": 9, "defin": 9, "record": 9, "To": 9, "mark": 9, "end": 9, "num": 9, "llm": 9, "glean": 9, "stop": 9, "add_messag": 9, "light_rag_extract": 9, "extracteventmapp": [9, 13], "event_desc_kei": 9, "__dj__event_description__": 9, "relevant_char_kei": 9, "__dj__relevant_characters__": 9, "\u5bf9\u6587\u672c\u7684\u60c5\u8282\u8fdb\u884c\u5206\u70b9\u603b\u7ed3": 9, "\u5e76\u62bd\u53d6\u4e0e\u60c5\u8282\u76f8\u5173\u7684\u4eba\u7269": 9, "\u5c3d\u91cf\u4e0d\u8981\u9057\u6f0f\u5185\u5bb9": 9, "\u4e0d\u8981\u6dfb\u52a0\u6587\u672c\u4e2d\u6ca1\u6709\u7684\u60c5\u8282": 9, "\u7b26\u5408\u539f\u6587\u4e8b\u5b9e": 9, "\u8054\u7cfb\u4e0a\u4e0b\u6587\u8bf4\u660e\u524d\u56e0\u540e\u679c": 9, "\u4f46\u4ecd\u7136\u9700\u8981\u7b26\u5408\u4e8b\u5b9e": 9, "\u4e0d\u8981\u5305\u542b\u4e3b\u89c2\u770b\u6cd5": 9, "\u6ce8\u610f\u8981\u5c3d\u53ef\u80fd\u4fdd\u7559\u6587\u672c\u7684\u4e13\u6709\u540d\u8bcd": 9, "\u6ce8\u610f\u76f8\u5173\u4eba\u7269\u9700\u8981\u5728\u5bf9\u5e94\u60c5\u8282\u4e2d\u51fa\u73b0": 9, "\u53ea\u62bd\u53d6\u60c5\u8282\u4e2d\u7684\u4e3b\u8981\u4eba\u7269": 9, "\u4e0d\u8981\u9057\u6f0f\u60c5\u8282\u7684\u4e3b\u8981\u4eba\u7269": 9, "\u603b\u7ed3\u683c\u5f0f\u5982\u4e0b": 9, "\u60c5\u82821": 9, "\u60c5\u8282\u63cf\u8ff0": 9, "\u76f8\u5173\u4eba\u7269": 9, "\u4eba\u72691": 9, "\u4eba\u72692": 9, "\u4eba\u72693": 9, "\u60c5\u82822": 9, "\u60c5\u82823": 9, "\u60c5\u8282": 9, "extractkeywordmapp": [9, 13], "keyword_kei": 9, "__dj__keyword__": 9, "topic": 9, "entir": 9, "These": 9, "idea": 9, "present": 9, "content_keyword": 9, "high_level_keyword": 9, "\u51b3\u7b56\u5236\u5b9a": 9, "\u5b87\u5b99\u610f\u4e49": 9, "extractnicknamemapp": [9, 13], "nickname_kei": 9, "__dj__nickname__": 9, "nicknam": 9, "\u7ed9\u5b9a\u4f60\u4e00\u6bb5\u6587\u672c": 9, "\u4f60\u7684\u4efb\u52a1\u662f\u5c06\u4eba\u7269\u4e4b\u95f4\u7684\u79f0\u547c\u65b9\u5f0f": 9, "\u6635\u79f0": 9, "\u63d0\u53d6\u51fa\u6765": 9, "\u9700\u8981\u7ed9\u51fa\u8bf4\u8bdd\u4eba\u5bf9\u88ab\u79f0\u547c\u4eba\u7684\u79f0\u547c": 9, "\u4e0d\u8981\u641e\u53cd\u4e86": 9, "\u76f8\u540c\u7684\u8bf4\u8bdd\u4eba\u548c\u88ab\u79f0\u547c\u4eba\u6700\u591a\u7ed9\u51fa\u4e00\u4e2a\u6700\u5e38\u7528\u7684\u79f0\u547c": 9, "\u8bf7\u4e0d\u8981\u8f93\u51fa\u4e92\u76f8\u6ca1\u6709\u6635\u79f0\u7684\u79f0\u547c\u65b9\u5f0f": 9, "\u8f93\u51fa\u683c\u5f0f\u5982\u4e0b": 9, "\u79f0\u547c\u65b9\u5f0f1": 9, "\u8bf4\u8bdd\u4eba": 9, "\u88ab\u79f0\u547c\u4eba": 9, "\u7684\u6635\u79f0": 9, "\u79f0\u547c\u65b9\u5f0f2": 9, "\u79f0\u547c\u65b9\u5f0f3": 9, "\u79f0\u547c\u65b9\u5f0f": 9, "doubl": 9, "fixunicodemapp": [9, 13], "fix": 9, "unicod": 9, "form": 9, "nfc": 9, "nfkc": 9, "nfd": 9, "nfkd": 9, "generateqafromexamplesmapp": [9, 13], "hf_model": 9, "qwen": 9, "qwen2": 9, "7b": 9, "instruct": 9, "seed_fil": 9, "example_num": 9, "similarity_threshold": 9, "example_templ": 9, "enable_vllm": 9, "your": 9, "\u8bf7\u4f60\u4ed4\u7ec6\u89c2\u5bdf\u591a\u4e2a\u793a\u4f8b\u6570\u636e\u7684\u8f93\u5165\u548c\u8f93\u51fa": 9, "\u6309\u7167\u4f60\u7684\u7406\u89e3": 9, "\u603b\u7ed3\u51fa\u76f8\u5e94\u89c4\u77e9": 9, "\u7136\u540e\u5199\u51fa\u4e00\u4e2a\u65b0\u7684": 9, "\u6ce8\u610f": 9, "\u65b0\u751f\u6210\u7684": 9, "\u9700\u8981\u6ee1\u8db3\u5982\u4e0b\u8981\u6c42": 9, "\u751f\u6210\u7684": 9, "\u4e0d\u80fd\u4e0e\u8f93\u5165\u7684": 9, "\u4e00\u81f4": 9, "\u4f46\u662f\u9700\u8981\u4fdd\u6301\u683c\u5f0f\u76f8\u540c": 9, "\u4e0d\u4e00\u5b9a\u8981\u5c40\u9650\u4e8e\u8f93\u5165": 9, "\u7684\u8bdd\u9898\u6216\u9886\u57df": 9, "\u9700\u8981\u6b63\u786e\u56de\u7b54\u751f\u6210\u7684": 9, "\u63d0\u4f9b\u7684": 9, "\u53ef\u80fd\u662f\u591a\u8f6e\u5bf9\u8bdd": 9, "\u4e5f\u53ef\u4ee5\u662f\u591a\u8f6e": 9, "\u5fc5\u987b\u6210\u5bf9\u51fa\u73b0": 9, "\u800c\u4e14": 9, "\u9700\u8981\u5728": 9, "\u4e4b\u524d": 9, "default_example_templ": 9, "n\u5982\u4e0b\u662f\u4e00\u6761\u793a\u4f8b\u6570\u636e": 9, "hugginfac": 9, "id": 9, "chatml": 9, "put": 9, "qa": 9, "guid": 9, "placehold": 9, "vllm": 9, "infer": 9, "acceler": 9, "qa_exampl": 9, "generateqafromtextmapp": [9, 13], "alibaba": 9, "pai": 9, "qwen1_5": 9, "doc2qa": 9, "llama3": 9, "8b": 9, "baichuan2": 9, "4b": 9, "1b8": 9, "0b5": 9, "suitabl": 9, "\u8499\u53e4\u56fd\u7684\u9996\u90fd\u662f\u4e4c\u5170\u5df4\u6258": 9, "ulaanbaatar": 9, "\u51b0\u5c9b\u7684\u9996\u90fd\u662f\u96f7\u514b\u96c5\u672a\u514b": 9, "reykjavik": 9, "\u8bf7\u95ee\u8499\u53e4\u56fd\u7684\u9996\u90fd\u662f\u54ea\u91cc": 9, "assist": 9, "\u4f60\u597d": 9, "\u6839\u636e\u63d0\u4f9b\u7684\u4fe1\u606f": 9, "\u51b0\u5c9b\u7684\u9996\u90fd\u662f\u54ea\u91cc\u5462": 9, "imageblurmapp": [9, 13], "p": 9, "blur_typ": 9, "gaussian": 9, "radiu": 9, "blur": 9, "blure": 9, "kernel": 9, "imagecaptioningfromgpt4vmapp": [9, 13], "api_kei": 9, "max_token": 9, "user_prompt": 9, "user_prompt_kei": 9, "keep_original_sampl": 9, "visison": 9, "reson": 9, "convers": 9, "custom": 9, "authent": 9, "guidanc": [9, 13], "gpt4": 9, "uers_prompt_kei": 9, "imagecaptioningmapp": [9, 13], "hf_img2seq": 9, "blip2": 9, "opt": 9, "caption_num": 9, "keep_candidate_mod": 9, "random_ani": 9, "prompt_kei": 9, "caption": 9, "anoth": 9, "how": 9, "candid": 9, "similar_one_simhash": 9, "batched_op": 9, "both": [9, 10], "suppos": 9, "b": 9, "denot": 9, "2nb": 9, "nb": 9, "mnb": 9, "similar_on": 9, "imagediffusionmapp": [9, 13], "hf_diffus": 9, "compvi": 9, "stabl": 9, "diffus": 9, "v1": 9, "torch_dtyp": 9, "fp32": 9, "revis": 9, "guidance_scal": 9, "aug_num": 9, "caption_kei": 9, "point": 9, "fp16": 9, "bf16": 9, "branch": 9, "git": 9, "extent": 9, "start": 9, "nois": 9, "higher": 9, "denois": 9, "amount": 9, "num_inference_step": 9, "essenti": 9, "scale": 9, "encourag": 9, "close": 9, "expens": 9, "qualiti": 9, "produc": 9, "otherwis": 9, "imagefaceblurmapp": [9, 13], "imagetaggingmapp": [9, 13], "__dj__image_tags__": 9, "nlpaugenmapp": [9, 13], "sequenti": 9, "delete_random_word": 9, "swap_random_word": 9, "spelling_error_word": 9, "split_random_word": 9, "keyboard_error_char": 9, "ocr_error_char": 9, "delete_random_char": 9, "swap_random_char": 9, "insert_random_char": 9, "simpli": 9, "nlpaug": 9, "librari": 9, "semant": 9, "significantli": 9, "combin": 9, "would": 9, "opened_aug_method": 9, "delet": 9, "love": 9, "swap": 9, "contigu": 9, "simul": 9, "spell": 9, "ll": 9, "keyboard": 9, "ov4": 9, "10ve": 9, "oe": 9, "ovl": 9, "insert": 9, "lkove": 9, "nlpcdazhmapp": [9, 13], "replace_similar_word": 9, "replace_homophone_char": 9, "replace_equivalent_num": 9, "nlpcda": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "\u8fd9\u8fb9\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "homophon": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6fd6\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u5f3a\u589e\u65b9\u6cd5": 9, "equival": 9, "represent": 9, "\u8fd9\u91cc\u4e00\u5171\u6709\u4f0d\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "optimizeqamapp": [9, 13], "\u8bf7\u4f18\u5316\u8f93\u5165\u7684\u95ee\u7b54\u5bf9": 9, "\u4f7f": 9, "\u90fd\u66f4\u52a0\u8be6\u7ec6": 9, "\u5fc5\u987b\u6309\u7167\u4ee5\u4e0b\u6807\u8bb0\u683c\u5f0f": 9, "\u76f4\u63a5\u8f93\u51fa\u4f18\u5316\u540e\u7684\u95ee\u7b54\u5bf9": 9, "n\u4f18\u5316\u540e\u7684\u95ee\u9898": 9, "n\u4f18\u5316\u540e\u7684\u56de\u7b54": 9, "\u4ee5\u4e0b\u662f\u539f\u59cb\u95ee\u7b54\u5bf9": 9, "sure": 9, "optimizequerymapp": [9, 13], "\u4f18\u5316\u95ee\u7b54\u5bf9\u4e2d\u7684": 9, "\u5c06\u5176\u66f4\u52a0\u8be6\u7ec6\u5177\u4f53": 9, "\u4f46\u4ecd\u53ef\u4ee5\u7531\u539f\u7b54\u6848\u56de\u7b54": 9, "\u53ea\u8f93\u51fa\u4f18\u5316\u540e\u7684": 9, "optimizeresponsemapp": [9, 13], "\u8bf7\u4f18\u5316\u95ee\u7b54\u5bf9\u4e2d\u7684\u56de\u7b54": 9, "\u4f46\u4ecd\u53ef\u4ee5\u56de\u7b54\u539f\u95ee\u9898": 9, "\u53ea\u8f93\u51fa\u4f18\u5316\u540e\u7684\u56de\u7b54": 9, "pairpreferencemapp": [9, 13], "rejected_kei": 9, "rejected_respons": 9, "reason_kei": 9, "reason": 9, "prefer": 9, "\u4f60\u7684\u4efb\u52a1\u662f\u6839\u636e\u53c2\u8003\u4fe1\u606f\u4fee\u6539\u95ee\u7b54\u5bf9\u4e2d\u7684\u56de\u7b54": 9, "\u5728\u8bed\u8a00\u98ce\u683c": 9, "\u4e8b\u5b9e\u6027": 9, "\u4eba\u7269\u8eab\u4efd": 9, "\u7acb\u573a\u7b49\u4efb\u4e00\u65b9\u9762\u4e0e\u539f\u56de\u7b54\u76f8\u53cd": 9, "\u5fc5\u987b\u6309\u7167\u4ee5\u4e0b\u6807\u8bb0\u683c\u5f0f\u8f93\u51fa": 9, "\u4e0d\u8981\u8f93\u51fa\u5176\u4ed6\u591a\u4f59\u5185\u5bb9": 9, "n\u751f\u6210\u7684\u65b0\u56de\u7b54": 9, "\u539f\u56e0": 9, "n\u751f\u6210\u8be5\u56de\u7b54\u7684\u539f\u56e0": 9, "n\u4ee5\u4e0b\u662f\u539f\u59cb\u95ee\u7b54\u5bf9": 9, "repons": 9, "reject": 9, "failur": 9, "punctuationnormalizationmapp": [9, 13], "removebibliographymapp": [9, 13], "bibliographi": 9, "removecommentsmapp": [9, 13], "doc_typ": 9, "inlin": 9, "multilin": 9, "removeheadermapp": [9, 13], "drop_no_head": 9, "header": 9, "removelongwordsmapp": [9, 13], "long": 9, "should_keep_long_word": 9, "removenonchinesecharacterlmapp": [9, 13], "keep_alphabet": 9, "keep_numb": 9, "keep_punc": 9, "removerepeatsentencesmapp": [9, 13], "ignore_special_charact": 9, "min_repeat_sentence_length": 9, "repeat": 9, "judg": 9, "letter": 9, "removespecificcharsmapp": [9, 13], "chars_to_remov": 9, "removetabletextmapp": [9, 13], "min_col": 9, "max_col": 9, "20": 9, "removewordswithincorrectsubstringsmapp": [9, 13], "substr": 9, "incorrect": 9, "should_keep_word_with_incorrect_substr": 9, "replacecontentmapp": [9, 13], "design": 9, "sentencesplitmapp": [9, 13], "textchunkmapp": [9, 13], "split_pattern": 9, "overlap_len": 9, "len": 9, "forc": 9, "cut": 9, "offerd": 9, "tiktoken": 9, "dashscop": 9, "72b": 9, "recursively_chunk": 9, "get_text_chunk": 9, "videocaptioningfromaudiomapp": [9, 13], "stream": 9, "videocaptioningfromframesmapp": [9, 13], "videocaptioningfromsummarizermapp": [9, 13], "hf_summar": 9, "consider_video_caption_from_video": 9, "consider_video_caption_from_audio": 9, "consider_video_caption_from_fram": 9, "consider_video_tags_from_audio": 9, "consider_video_tags_from_fram": 9, "vid_cap_from_vid_arg": 9, "vid_cap_from_frm_arg": 9, "vid_tag_from_aud_arg": 9, "vid_tag_from_frm_arg": 9, "keep_tag_num": 9, "too": 9, "bring": 9, "frequent": 9, "videocaptioningfromvideomapp": [9, 13], "hf_video_blip": 9, "kpyu": 9, "ego4d": 9, "videoffmpegwrappedmapp": [9, 13], "videofaceblurmapp": [9, 13], "videoremovewatermarkmapp": [9, 13], "roi_str": 9, "roi_typ": 9, "roi_kei": 9, "min_frame_threshold": 9, "detection_method": 9, "pixel_valu": 9, "region": 9, "x1": 9, "y1": 9, "x2": 9, "y2": 9, "roi": 9, "pixel": 9, "corner": 9, "coordin": 9, "wight": 9, "coodin": 9, "pixel_divers": 9, "useless": 9, "videoresizeaspectratiomapp": [9, 13], "increas": 9, "decreas": 9, "enforc": 9, "adjust": 9, "either": 9, "enlarg": 9, "accept": 9, "videoresizeresolutionmapp": [9, 13], "force_original_aspect_ratio": 9, "disabl": 9, "force_divisible_bi": 9, "leav": 9, "super": 9, "deep": 9, "futur": 9, "necessari": 9, "ensur": 9, "integ": 9, "even": 9, "videosplitbydurationmapp": [9, 13], "split_dur": 9, "min_last_split_dur": 9, "discard": 9, "split_videos_by_dur": 9, "videosplitbykeyframemapp": [9, 13], "get_split_key_fram": 9, "videosplitbyscenemapp": [9, 13], "detector": 9, "contentdetector": 9, "27": 9, "min_scene_len": 9, "15": 9, "show_progress": 9, "scene": 9, "avaliable_detector": 9, "adaptivedetector": 9, "window_width": 9, "min_content_v": 9, "luma_onli": 9, "kernel_s": 9, "video_manag": 9, "min_delta_hsv": 9, "thresholddetector": 9, "fade_bia": 9, "add_final_scen": 9, "block_siz": 9, "scenedetect": 9, "progress": 9, "videotaggingfromaudiomapp": [9, 13], "hf_ast": 9, "mit": 9, "ast": 9, "finetun": 9, "audioset": 9, "4593": 9, "__dj__video_audio_tags__": 9, "spectrogram": 9, "hf": 9, "trust": 9, "videotaggingfromframesmapp": [9, 13], "whitespacenormalizationmapp": [9, 13], "0x20": 9, "wikipedia": 9, "wiki": 9, "whitespace_charact": 9, "frequencyspecifiedfieldselector": [10, 13], "top_ratio": 10, "topk": 10, "sort": 10, "frequenc": 10, "descend": 10, "randomselector": [10, 13], "select_ratio": 10, "select_num": 10, "rangespecifiedfieldselector": [10, 13], "lower_percentil": 10, "upper_percentil": 10, "lower_rank": 10, "upper_rank": 10, "smallest": 10, "bound": 10, "upper": 10, "topkspecifiedfieldselector": [10, 13], "kdd": 13, "24": 13, "modal": 13, "foundat": 13, "practic": 13, "data_juic": 13, "core": 13, "index": 13, "page": 13}, "objects": {"": [[0, 0, 0, "-", "data_juicer"]], "data_juicer": [[1, 0, 0, "-", "analysis"], [2, 0, 0, "-", "config"], [3, 0, 0, "-", "core"], [0, 3, 1, "", "cuda_device_count"], [4, 0, 0, "-", "format"], [0, 3, 1, "", "is_cuda_available"], [5, 0, 0, "-", "ops"], [11, 0, 0, "-", "tools"], [12, 0, 0, "-", "utils"]], "data_juicer.analysis": [[1, 1, 1, "", "ColumnWiseAnalysis"], [1, 1, 1, "", "DiversityAnalysis"], [1, 1, 1, "", "OverallAnalysis"]], "data_juicer.analysis.ColumnWiseAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "draw_box"], [1, 2, 1, "", "draw_hist"]], "data_juicer.analysis.DiversityAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "compute"]], "data_juicer.analysis.OverallAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "refine_single_column"]], "data_juicer.config": [[2, 3, 1, "", "export_config"], [2, 3, 1, "", "get_init_configs"], [2, 3, 1, "", "init_configs"], [2, 3, 1, "", "merge_config"], [2, 3, 1, "", "prepare_side_configs"]], "data_juicer.core": [[3, 1, 1, "", "Adapter"], [3, 1, 1, "", "Analyzer"], [3, 1, 1, "", "Executor"], [3, 1, 1, "", "Exporter"], [3, 1, 1, "", "Monitor"], [3, 1, 1, "", "NestedDataset"], [3, 1, 1, "", "Tracer"]], "data_juicer.core.Adapter": [[3, 4, 1, "", "MAX_BATCH_SIZE"], [3, 2, 1, "", "__init__"], [3, 2, 1, "", "adapt_workloads"], [3, 2, 1, "", "batch_size_strategy"], [3, 2, 1, "", "execute_and_probe"], [3, 2, 1, "", "probe_small_batch"], [3, 2, 1, "", "take_batch"]], "data_juicer.core.Analyzer": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "run"]], "data_juicer.core.Executor": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "run"], [3, 2, 1, "", "sample_data"]], "data_juicer.core.Exporter": [[3, 4, 1, "", "GiB"], [3, 4, 1, "", "KiB"], [3, 4, 1, "", "MiB"], [3, 4, 1, "", "TiB"], [3, 2, 1, "", "__init__"], [3, 2, 1, "", "export"], [3, 2, 1, "", "export_compute_stats"], [3, 2, 1, "", "to_json"], [3, 2, 1, "", "to_jsonl"], [3, 2, 1, "", "to_parquet"]], "data_juicer.core.Monitor": [[3, 4, 1, "", "DYNAMIC_FIELDS"], [3, 2, 1, "", "__init__"], [3, 2, 1, "", "analyze_resource_util_list"], [3, 2, 1, "", "analyze_single_resource_util"], [3, 2, 1, "", "draw_resource_util_graph"], [3, 2, 1, "", "monitor_all_resources"], [3, 2, 1, "", "monitor_current_resources"], [3, 2, 1, "", "monitor_func"]], "data_juicer.core.NestedDataset": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "add_column"], [3, 2, 1, "", "cleanup_cache_files"], [3, 2, 1, "", "filter"], [3, 2, 1, "", "from_dict"], [3, 2, 1, "", "load_from_disk"], [3, 2, 1, "", "map"], [3, 2, 1, "", "process"], [3, 2, 1, "", "remove_columns"], [3, 2, 1, "", "select"], [3, 2, 1, "", "select_columns"]], "data_juicer.core.Tracer": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "trace_batch_mapper"], [3, 2, 1, "", "trace_deduplicator"], [3, 2, 1, "", "trace_filter"], [3, 2, 1, "", "trace_mapper"]], "data_juicer.format": [[4, 1, 1, "", "CsvFormatter"], [4, 1, 1, "", "EmptyFormatter"], [4, 1, 1, "", "JsonFormatter"], [4, 1, 1, "", "LocalFormatter"], [4, 1, 1, "", "MixtureFormatter"], [4, 1, 1, "", "ParquetFormatter"], [4, 1, 1, "", "RayEmptyFormatter"], [4, 1, 1, "", "RemoteFormatter"], [4, 1, 1, "", "TextFormatter"], [4, 1, 1, "", "TsvFormatter"], [4, 3, 1, "", "load_formatter"]], "data_juicer.format.CsvFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.EmptyFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"], [4, 5, 1, "", "null_value"]], "data_juicer.format.JsonFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.LocalFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.MixtureFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"], [4, 2, 1, "", "random_sample"]], "data_juicer.format.ParquetFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.RayEmptyFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"], [4, 5, 1, "", "null_value"]], "data_juicer.format.RemoteFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.TextFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.TsvFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.ops": [[5, 1, 1, "", "Deduplicator"], [5, 1, 1, "", "Filter"], [5, 1, 1, "", "Mapper"], [5, 1, 1, "", "Selector"], [6, 0, 0, "-", "common"], [7, 0, 0, "-", "deduplicator"], [8, 0, 0, "-", "filter"], [5, 3, 1, "", "load_ops"], [9, 0, 0, "-", "mapper"], [10, 0, 0, "-", "selector"]], "data_juicer.ops.Deduplicator": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "compute_hash"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.Filter": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "compute_stats_batched"], [5, 2, 1, "", "compute_stats_single"], [5, 2, 1, "", "process_batched"], [5, 2, 1, "", "process_single"], [5, 2, 1, "", "run"]], "data_juicer.ops.Mapper": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "process_batched"], [5, 2, 1, "", "process_single"], [5, 2, 1, "", "run"]], "data_juicer.ops.Selector": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.common": [[6, 3, 1, "", "get_sentences_from_document"], [6, 3, 1, "", "get_words_from_document"], [6, 3, 1, "", "merge_on_whitespace_tab_newline"], [6, 3, 1, "", "split_on_newline_tab_whitespace"], [6, 3, 1, "", "split_on_whitespace"], [6, 3, 1, "", "split_text_by_punctuation"], [6, 3, 1, "", "strip"], [6, 3, 1, "", "words_augmentation"], [6, 3, 1, "", "words_refinement"]], "data_juicer.ops.deduplicator": [[7, 1, 1, "", "DocumentDeduplicator"], [7, 1, 1, "", "DocumentMinhashDeduplicator"], [7, 1, 1, "", "DocumentSimhashDeduplicator"], [7, 1, 1, "", "ImageDeduplicator"], [7, 1, 1, "", "RayBasicDeduplicator"], [7, 1, 1, "", "RayDocumentDeduplicator"], [7, 1, 1, "", "RayImageDeduplicator"], [7, 1, 1, "", "RayVideoDeduplicator"], [7, 1, 1, "", "VideoDeduplicator"]], "data_juicer.ops.deduplicator.DocumentDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.ImageDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.RayBasicDeduplicator": [[7, 4, 1, "", "EMPTY_HASH_VALUE"], [7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"], [7, 2, 1, "", "compute_stats_single"], [7, 2, 1, "", "process_single"]], "data_juicer.ops.deduplicator.RayDocumentDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.RayImageDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.RayVideoDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.VideoDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.filter": [[8, 1, 1, "", "AlphanumericFilter"], [8, 1, 1, "", "AudioDurationFilter"], [8, 1, 1, "", "AudioNMFSNRFilter"], [8, 1, 1, "", "AudioSizeFilter"], [8, 1, 1, "", "AverageLineLengthFilter"], [8, 1, 1, "", "CharacterRepetitionFilter"], [8, 1, 1, "", "FlaggedWordFilter"], [8, 1, 1, "", "ImageAestheticsFilter"], [8, 1, 1, "", "ImageAspectRatioFilter"], [8, 1, 1, "", "ImageFaceCountFilter"], [8, 1, 1, "", "ImageFaceRatioFilter"], [8, 1, 1, "", "ImageNSFWFilter"], [8, 1, 1, "", "ImagePairSimilarityFilter"], [8, 1, 1, "", "ImageShapeFilter"], [8, 1, 1, "", "ImageSizeFilter"], [8, 1, 1, "", "ImageTextMatchingFilter"], [8, 1, 1, "", "ImageTextSimilarityFilter"], [8, 1, 1, "", "ImageWatermarkFilter"], [8, 1, 1, "", "LanguageIDScoreFilter"], [8, 1, 1, "", "MaximumLineLengthFilter"], [8, 1, 1, "", "PerplexityFilter"], [8, 1, 1, "", "PhraseGroundingRecallFilter"], [8, 1, 1, "", "SpecialCharactersFilter"], [8, 1, 1, "", "SpecifiedFieldFilter"], [8, 1, 1, "", "SpecifiedNumericFieldFilter"], [8, 1, 1, "", "StopWordsFilter"], [8, 1, 1, "", "SuffixFilter"], [8, 1, 1, "", "TextActionFilter"], [8, 1, 1, "", "TextEntityDependencyFilter"], [8, 1, 1, "", "TextLengthFilter"], [8, 1, 1, "", "TokenNumFilter"], [8, 1, 1, "", "VideoAestheticsFilter"], [8, 1, 1, "", "VideoAspectRatioFilter"], [8, 1, 1, "", "VideoDurationFilter"], [8, 1, 1, "", "VideoFramesTextSimilarityFilter"], [8, 1, 1, "", "VideoMotionScoreFilter"], [8, 1, 1, "", "VideoMotionScoreRaftFilter"], [8, 1, 1, "", "VideoNSFWFilter"], [8, 1, 1, "", "VideoOcrAreaRatioFilter"], [8, 1, 1, "", "VideoResolutionFilter"], [8, 1, 1, "", "VideoTaggingFromFramesFilter"], [8, 1, 1, "", "VideoWatermarkFilter"], [8, 1, 1, "", "WordRepetitionFilter"], [8, 1, 1, "", "WordsNumFilter"]], "data_juicer.ops.filter.AlphanumericFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.AudioDurationFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.AudioNMFSNRFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.AudioSizeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.AverageLineLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.CharacterRepetitionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.FlaggedWordFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageAestheticsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageAspectRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageFaceCountFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageFaceRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageNSFWFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImagePairSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageShapeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageSizeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageTextMatchingFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageTextSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageWatermarkFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.LanguageIDScoreFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.MaximumLineLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.PerplexityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.PhraseGroundingRecallFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.SpecialCharactersFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.SpecifiedFieldFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.SpecifiedNumericFieldFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.StopWordsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.SuffixFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.TextActionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.TextEntityDependencyFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.TextLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.TokenNumFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoAestheticsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoAspectRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoDurationFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoFramesTextSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoMotionScoreFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_flow"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"], [8, 2, 1, "", "setup_model"]], "data_juicer.ops.filter.VideoMotionScoreRaftFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_flow"], [8, 2, 1, "", "setup_model"]], "data_juicer.ops.filter.VideoNSFWFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoOcrAreaRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "get_reader"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoResolutionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoTaggingFromFramesFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoWatermarkFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.WordRepetitionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.WordsNumFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper": [[9, 1, 1, "", "AudioFFmpegWrappedMapper"], [9, 1, 1, "", "CalibrateQAMapper"], [9, 1, 1, "", "CalibrateQueryMapper"], [9, 1, 1, "", "CalibrateResponseMapper"], [9, 1, 1, "", "ChineseConvertMapper"], [9, 1, 1, "", "CleanCopyrightMapper"], [9, 1, 1, "", "CleanEmailMapper"], [9, 1, 1, "", "CleanHtmlMapper"], [9, 1, 1, "", "CleanIpMapper"], [9, 1, 1, "", "CleanLinksMapper"], [9, 1, 1, "", "ExpandMacroMapper"], [9, 1, 1, "", "ExtractEntityAttributeMapper"], [9, 1, 1, "", "ExtractEntityRelationMapper"], [9, 1, 1, "", "ExtractEventMapper"], [9, 1, 1, "", "ExtractKeywordMapper"], [9, 1, 1, "", "ExtractNicknameMapper"], [9, 1, 1, "", "FixUnicodeMapper"], [9, 1, 1, "", "GenerateQAFromExamplesMapper"], [9, 1, 1, "", "GenerateQAFromTextMapper"], [9, 1, 1, "", "ImageBlurMapper"], [9, 1, 1, "", "ImageCaptioningFromGPT4VMapper"], [9, 1, 1, "", "ImageCaptioningMapper"], [9, 1, 1, "", "ImageDiffusionMapper"], [9, 1, 1, "", "ImageFaceBlurMapper"], [9, 1, 1, "", "ImageTaggingMapper"], [9, 1, 1, "", "NlpaugEnMapper"], [9, 1, 1, "", "NlpcdaZhMapper"], [9, 1, 1, "", "OptimizeQAMapper"], [9, 1, 1, "", "OptimizeQueryMapper"], [9, 1, 1, "", "OptimizeResponseMapper"], [9, 1, 1, "", "PairPreferenceMapper"], [9, 1, 1, "", "PunctuationNormalizationMapper"], [9, 1, 1, "", "RemoveBibliographyMapper"], [9, 1, 1, "", "RemoveCommentsMapper"], [9, 1, 1, "", "RemoveHeaderMapper"], [9, 1, 1, "", "RemoveLongWordsMapper"], [9, 1, 1, "", "RemoveNonChineseCharacterlMapper"], [9, 1, 1, "", "RemoveRepeatSentencesMapper"], [9, 1, 1, "", "RemoveSpecificCharsMapper"], [9, 1, 1, "", "RemoveTableTextMapper"], [9, 1, 1, "", "RemoveWordsWithIncorrectSubstringsMapper"], [9, 1, 1, "", "ReplaceContentMapper"], [9, 1, 1, "", "SentenceSplitMapper"], [9, 1, 1, "", "TextChunkMapper"], [9, 1, 1, "", "VideoCaptioningFromAudioMapper"], [9, 1, 1, "", "VideoCaptioningFromFramesMapper"], [9, 1, 1, "", "VideoCaptioningFromSummarizerMapper"], [9, 1, 1, "", "VideoCaptioningFromVideoMapper"], [9, 1, 1, "", "VideoFFmpegWrappedMapper"], [9, 1, 1, "", "VideoFaceBlurMapper"], [9, 1, 1, "", "VideoRemoveWatermarkMapper"], [9, 1, 1, "", "VideoResizeAspectRatioMapper"], [9, 1, 1, "", "VideoResizeResolutionMapper"], [9, 1, 1, "", "VideoSplitByDurationMapper"], [9, 1, 1, "", "VideoSplitByKeyFrameMapper"], [9, 1, 1, "", "VideoSplitBySceneMapper"], [9, 1, 1, "", "VideoTaggingFromAudioMapper"], [9, 1, 1, "", "VideoTaggingFromFramesMapper"], [9, 1, 1, "", "WhitespaceNormalizationMapper"]], "data_juicer.ops.mapper.AudioFFmpegWrappedMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.CalibrateQAMapper": [[9, 4, 1, "", "DEFAULT_INPUT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_OUTPUT_PATTERN"], [9, 4, 1, "", "DEFAULT_QA_PAIR_TEMPLATE"], [9, 4, 1, "", "DEFAULT_REFERENCE_TEMPLATE"], [9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "build_input"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.CalibrateQueryMapper": [[9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "parse_output"]], "data_juicer.ops.mapper.CalibrateResponseMapper": [[9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "parse_output"]], "data_juicer.ops.mapper.ChineseConvertMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanCopyrightMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanEmailMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanHtmlMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanIpMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanLinksMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ExpandMacroMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ExtractEntityAttributeMapper": [[9, 4, 1, "", "DEFAULT_ATTR_PATTERN_TEMPLATE"], [9, 4, 1, "", "DEFAULT_DEMON_PATTERN"], [9, 4, 1, "", "DEFAULT_INPUT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT_TEMPLATE"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ExtractEntityRelationMapper": [[9, 4, 1, "", "DEFAULT_COMPLETION_DELIMITER"], [9, 4, 1, "", "DEFAULT_CONTINUE_PROMPT"], [9, 4, 1, "", "DEFAULT_ENTITY_PATTERN"], [9, 4, 1, "", "DEFAULT_ENTITY_TYPES"], [9, 4, 1, "", "DEFAULT_IF_LOOP_PROMPT"], [9, 4, 1, "", "DEFAULT_PROMPT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_RECORD_DELIMITER"], [9, 4, 1, "", "DEFAULT_RELATION_PATTERN"], [9, 4, 1, "", "DEFAULT_TUPLE_DELIMITER"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "add_message"], [9, 2, 1, "", "light_rag_extraction"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.ExtractEventMapper": [[9, 4, 1, "", "DEFAULT_INPUT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_OUTPUT_PATTERN"], [9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ExtractKeywordMapper": [[9, 4, 1, "", "DEFAULT_COMPLETION_DELIMITER"], [9, 4, 1, "", "DEFAULT_OUTPUT_PATTERN"], [9, 4, 1, "", "DEFAULT_PROMPT_TEMPLATE"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.ExtractNicknameMapper": [[9, 4, 1, "", "DEFAULT_INPUT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_OUTPUT_PATTERN"], [9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.FixUnicodeMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.GenerateQAFromExamplesMapper": [[9, 4, 1, "", "DEFAULT_EXAMPLE_TEMPLATE"], [9, 4, 1, "", "DEFAULT_INPUT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_OUTPUT_PATTERN"], [9, 4, 1, "", "DEFAULT_QA_PAIR_TEMPLATE"], [9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "build_input"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.GenerateQAFromTextMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ImageBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ImageCaptioningMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ImageDiffusionMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ImageFaceBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.ImageTaggingMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.NlpaugEnMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.NlpcdaZhMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.OptimizeQAMapper": [[9, 4, 1, "", "DEFAULT_INPUT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_OUTPUT_PATTERN"], [9, 4, 1, "", "DEFAULT_QA_PAIR_TEMPLATE"], [9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "build_input"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.OptimizeQueryMapper": [[9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "parse_output"]], "data_juicer.ops.mapper.OptimizeResponseMapper": [[9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "parse_output"]], "data_juicer.ops.mapper.PairPreferenceMapper": [[9, 4, 1, "", "DEFAULT_INPUT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_OUTPUT_PATTERN"], [9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "build_input"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.PunctuationNormalizationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveBibliographyMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveCommentsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveHeaderMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveLongWordsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"], [9, 2, 1, "", "should_keep_long_word"]], "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveRepeatSentencesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveSpecificCharsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveTableTextMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"], [9, 2, 1, "", "should_keep_word_with_incorrect_substrings"]], "data_juicer.ops.mapper.ReplaceContentMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.SentenceSplitMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.TextChunkMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "get_text_chunks"], [9, 2, 1, "", "process_batched"], [9, 2, 1, "", "recursively_chunk"]], "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoFFmpegWrappedMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoFaceBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoRemoveWatermarkMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoResizeAspectRatioMapper": [[9, 4, 1, "", "STRATEGY"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoResizeResolutionMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoSplitByDurationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"], [9, 2, 1, "", "split_videos_by_duration"]], "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "get_split_key_frame"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoSplitBySceneMapper": [[9, 2, 1, "", "__init__"], [9, 4, 1, "", "avaliable_detectors"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoTaggingFromAudioMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoTaggingFromFramesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.WhitespaceNormalizationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.selector": [[10, 1, 1, "", "FrequencySpecifiedFieldSelector"], [10, 1, 1, "", "RandomSelector"], [10, 1, 1, "", "RangeSpecifiedFieldSelector"], [10, 1, 1, "", "TopkSpecifiedFieldSelector"]], "data_juicer.ops.selector.FrequencySpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.RandomSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.RangeSpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.TopkSpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:method", "3": "py:function", "4": "py:attribute", "5": "py:property"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"], "3": ["py", "function", "Python function"], "4": ["py", "attribute", "Python attribute"], "5": ["py", "property", "Python property"]}, "titleterms": {"data_juic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14], "analysi": 1, "config": 2, "core": 3, "format": 4, "op": [5, 6, 7, 8, 9, 10], "common": 6, "dedupl": 7, "filter": 8, "mapper": 9, "selector": 10, "tool": 11, "util": 12, "welcom": 13, "data": 13, "juicer": 13, "": 13, "document": 13, "tutori": 13, "api": 13, "refer": 13, "indic": 13, "tabl": 13}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx": 58}, "alltitles": {"data_juicer": [[0, "module-data_juicer"], [14, "data-juicer"]], "data_juicer.analysis": [[1, "module-data_juicer.analysis"]], "data_juicer.config": [[2, "module-data_juicer.config"]], "data_juicer.core": [[3, "module-data_juicer.core"]], "}": [[3, "id1"], [3, "id2"]], "data_juicer.format": [[4, "module-data_juicer.format"]], "data_juicer.ops": [[5, "module-data_juicer.ops"]], "data_juicer.ops.common": [[6, "module-data_juicer.ops.common"]], "data_juicer.ops.deduplicator": [[7, "module-data_juicer.ops.deduplicator"]], "data_juicer.ops.filter": [[8, "module-data_juicer.ops.filter"]], "data_juicer.ops.mapper": [[9, "module-data_juicer.ops.mapper"]], "data_juicer.ops.selector": [[10, "module-data_juicer.ops.selector"]], "data_juicer.tools": [[11, "module-data_juicer.tools"]], "data_juicer.utils": [[12, "module-data_juicer.utils"]], "Welcome to data-juicer\u2019s documentation!": [[13, "welcome-to-data-juicer-s-documentation"]], "Tutorial": [[13, "tutorial"]], "API Reference": [[13, null]], "Indices and Tables": [[13, "indices-and-tables"]]}, "indexentries": {"cuda_device_count() (in module data_juicer)": [[0, "data_juicer.cuda_device_count"]], "data_juicer": [[0, "module-data_juicer"]], "is_cuda_available() (in module data_juicer)": [[0, "data_juicer.is_cuda_available"]], "module": [[0, "module-data_juicer"], [1, "module-data_juicer.analysis"], [2, "module-data_juicer.config"], [3, "module-data_juicer.core"], [4, "module-data_juicer.format"], [5, "module-data_juicer.ops"], [6, "module-data_juicer.ops.common"], [7, "module-data_juicer.ops.deduplicator"], [8, "module-data_juicer.ops.filter"], [9, "module-data_juicer.ops.mapper"], [10, "module-data_juicer.ops.selector"], [11, "module-data_juicer.tools"], [12, "module-data_juicer.utils"]], "columnwiseanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.ColumnWiseAnalysis"]], "diversityanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.DiversityAnalysis"]], "overallanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.OverallAnalysis"]], "__init__() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.__init__"]], "__init__() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.__init__"]], "__init__() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.__init__"]], "analyze() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.analyze"]], "analyze() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.analyze"]], "analyze() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.analyze"]], "compute() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.compute"]], "data_juicer.analysis": [[1, "module-data_juicer.analysis"]], "draw_box() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.draw_box"]], "draw_hist() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.draw_hist"]], "refine_single_column() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.refine_single_column"]], "data_juicer.config": [[2, "module-data_juicer.config"]], "export_config() (in module data_juicer.config)": [[2, "data_juicer.config.export_config"]], "get_init_configs() (in module data_juicer.config)": [[2, "data_juicer.config.get_init_configs"]], "init_configs() (in module data_juicer.config)": [[2, "data_juicer.config.init_configs"]], "merge_config() (in module data_juicer.config)": [[2, "data_juicer.config.merge_config"]], "prepare_side_configs() (in module data_juicer.config)": [[2, "data_juicer.config.prepare_side_configs"]], "adapter (class in data_juicer.core)": [[3, "data_juicer.core.Adapter"]], "analyzer (class in data_juicer.core)": [[3, "data_juicer.core.Analyzer"]], "dynamic_fields (data_juicer.core.monitor attribute)": [[3, "data_juicer.core.Monitor.DYNAMIC_FIELDS"]], "executor (class in data_juicer.core)": [[3, "data_juicer.core.Executor"]], "exporter (class in data_juicer.core)": [[3, "data_juicer.core.Exporter"]], "gib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.GiB"]], "kib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.KiB"]], "max_batch_size (data_juicer.core.adapter attribute)": [[3, "data_juicer.core.Adapter.MAX_BATCH_SIZE"]], "mib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.MiB"]], "monitor (class in data_juicer.core)": [[3, "data_juicer.core.Monitor"]], "nesteddataset (class in data_juicer.core)": [[3, "data_juicer.core.NestedDataset"]], "tib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.TiB"]], "tracer (class in data_juicer.core)": [[3, "data_juicer.core.Tracer"]], "__init__() (data_juicer.core.adapter method)": [[3, "data_juicer.core.Adapter.__init__"]], "__init__() (data_juicer.core.analyzer method)": [[3, "data_juicer.core.Analyzer.__init__"]], "__init__() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.__init__"]], "__init__() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.__init__"]], "__init__() (data_juicer.core.monitor method)": [[3, "data_juicer.core.Monitor.__init__"]], "__init__() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.__init__"]], "__init__() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.__init__"]], "adapt_workloads() (data_juicer.core.adapter method)": [[3, "data_juicer.core.Adapter.adapt_workloads"]], "add_column() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.add_column"]], "analyze_resource_util_list() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.analyze_resource_util_list"]], "analyze_single_resource_util() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.analyze_single_resource_util"]], "batch_size_strategy() (data_juicer.core.adapter method)": [[3, "data_juicer.core.Adapter.batch_size_strategy"]], "cleanup_cache_files() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.cleanup_cache_files"]], "data_juicer.core": [[3, "module-data_juicer.core"]], "draw_resource_util_graph() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.draw_resource_util_graph"]], "execute_and_probe() (data_juicer.core.adapter static method)": [[3, "data_juicer.core.Adapter.execute_and_probe"]], "export() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.export"]], "export_compute_stats() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.export_compute_stats"]], "filter() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.filter"]], "from_dict() (data_juicer.core.nesteddataset class method)": [[3, "data_juicer.core.NestedDataset.from_dict"]], "load_from_disk() (data_juicer.core.nesteddataset static method)": [[3, "data_juicer.core.NestedDataset.load_from_disk"]], "map() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.map"]], "monitor_all_resources() (data_juicer.core.monitor method)": [[3, "data_juicer.core.Monitor.monitor_all_resources"]], "monitor_current_resources() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.monitor_current_resources"]], "monitor_func() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.monitor_func"]], "probe_small_batch() (data_juicer.core.adapter method)": [[3, "data_juicer.core.Adapter.probe_small_batch"]], "process() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.process"]], "remove_columns() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.remove_columns"]], "run() (data_juicer.core.analyzer method)": [[3, "data_juicer.core.Analyzer.run"]], "run() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.run"]], "sample_data() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.sample_data"]], "select() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.select"]], "select_columns() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.select_columns"]], "take_batch() (data_juicer.core.adapter static method)": [[3, "data_juicer.core.Adapter.take_batch"]], "to_json() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_json"]], "to_jsonl() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_jsonl"]], "to_parquet() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_parquet"]], "trace_batch_mapper() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_batch_mapper"]], "trace_deduplicator() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_deduplicator"]], "trace_filter() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_filter"]], "trace_mapper() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_mapper"]], "csvformatter (class in data_juicer.format)": [[4, "data_juicer.format.CsvFormatter"]], "emptyformatter (class in data_juicer.format)": [[4, "data_juicer.format.EmptyFormatter"]], "jsonformatter (class in data_juicer.format)": [[4, "data_juicer.format.JsonFormatter"]], "localformatter (class in data_juicer.format)": [[4, "data_juicer.format.LocalFormatter"]], "mixtureformatter (class in data_juicer.format)": [[4, "data_juicer.format.MixtureFormatter"]], "parquetformatter (class in data_juicer.format)": [[4, "data_juicer.format.ParquetFormatter"]], "rayemptyformatter (class in data_juicer.format)": [[4, "data_juicer.format.RayEmptyFormatter"]], "remoteformatter (class in data_juicer.format)": [[4, "data_juicer.format.RemoteFormatter"]], "suffixes (data_juicer.format.csvformatter attribute)": [[4, "data_juicer.format.CsvFormatter.SUFFIXES"]], "suffixes (data_juicer.format.emptyformatter attribute)": [[4, "data_juicer.format.EmptyFormatter.SUFFIXES"]], "suffixes (data_juicer.format.jsonformatter attribute)": [[4, "data_juicer.format.JsonFormatter.SUFFIXES"]], "suffixes (data_juicer.format.parquetformatter attribute)": [[4, "data_juicer.format.ParquetFormatter.SUFFIXES"]], "suffixes (data_juicer.format.rayemptyformatter attribute)": [[4, "data_juicer.format.RayEmptyFormatter.SUFFIXES"]], "suffixes (data_juicer.format.textformatter attribute)": [[4, "data_juicer.format.TextFormatter.SUFFIXES"]], "suffixes (data_juicer.format.tsvformatter attribute)": [[4, "data_juicer.format.TsvFormatter.SUFFIXES"]], "textformatter (class in data_juicer.format)": [[4, "data_juicer.format.TextFormatter"]], "tsvformatter (class in data_juicer.format)": [[4, "data_juicer.format.TsvFormatter"]], "__init__() (data_juicer.format.csvformatter method)": [[4, "data_juicer.format.CsvFormatter.__init__"]], "__init__() (data_juicer.format.emptyformatter method)": [[4, "data_juicer.format.EmptyFormatter.__init__"]], "__init__() (data_juicer.format.jsonformatter method)": [[4, "data_juicer.format.JsonFormatter.__init__"]], "__init__() (data_juicer.format.localformatter method)": [[4, "data_juicer.format.LocalFormatter.__init__"]], "__init__() (data_juicer.format.mixtureformatter method)": [[4, "data_juicer.format.MixtureFormatter.__init__"]], "__init__() (data_juicer.format.parquetformatter method)": [[4, "data_juicer.format.ParquetFormatter.__init__"]], "__init__() (data_juicer.format.rayemptyformatter method)": [[4, "data_juicer.format.RayEmptyFormatter.__init__"]], "__init__() (data_juicer.format.remoteformatter method)": [[4, "data_juicer.format.RemoteFormatter.__init__"]], "__init__() (data_juicer.format.textformatter method)": [[4, "data_juicer.format.TextFormatter.__init__"]], "__init__() (data_juicer.format.tsvformatter method)": [[4, "data_juicer.format.TsvFormatter.__init__"]], "data_juicer.format": [[4, "module-data_juicer.format"]], "load_dataset() (data_juicer.format.emptyformatter method)": [[4, "data_juicer.format.EmptyFormatter.load_dataset"]], "load_dataset() (data_juicer.format.localformatter method)": [[4, "data_juicer.format.LocalFormatter.load_dataset"]], "load_dataset() (data_juicer.format.mixtureformatter method)": [[4, "data_juicer.format.MixtureFormatter.load_dataset"]], "load_dataset() (data_juicer.format.rayemptyformatter method)": [[4, "data_juicer.format.RayEmptyFormatter.load_dataset"]], "load_dataset() (data_juicer.format.remoteformatter method)": [[4, "data_juicer.format.RemoteFormatter.load_dataset"]], "load_dataset() (data_juicer.format.textformatter method)": [[4, "data_juicer.format.TextFormatter.load_dataset"]], "load_formatter() (in module data_juicer.format)": [[4, "data_juicer.format.load_formatter"]], "null_value (data_juicer.format.emptyformatter property)": [[4, "data_juicer.format.EmptyFormatter.null_value"]], "null_value (data_juicer.format.rayemptyformatter property)": [[4, "data_juicer.format.RayEmptyFormatter.null_value"]], "random_sample() (data_juicer.format.mixtureformatter class method)": [[4, "data_juicer.format.MixtureFormatter.random_sample"]], "deduplicator (class in data_juicer.ops)": [[5, "data_juicer.ops.Deduplicator"]], "filter (class in data_juicer.ops)": [[5, "data_juicer.ops.Filter"]], "mapper (class in data_juicer.ops)": [[5, "data_juicer.ops.Mapper"]], "selector (class in data_juicer.ops)": [[5, "data_juicer.ops.Selector"]], "__init__() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.__init__"]], "__init__() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.__init__"]], "__init__() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.__init__"]], "__init__() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.__init__"]], "compute_hash() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.compute_hash"]], "compute_stats_batched() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.compute_stats_batched"]], "compute_stats_single() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.compute_stats_single"]], "data_juicer.ops": [[5, "module-data_juicer.ops"]], "load_ops() (in module data_juicer.ops)": [[5, "data_juicer.ops.load_ops"]], "process() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.process"]], "process() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.process"]], "process_batched() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.process_batched"]], "process_batched() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.process_batched"]], "process_single() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.process_single"]], "process_single() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.process_single"]], "run() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.run"]], "run() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.run"]], "run() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.run"]], "run() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.run"]], "data_juicer.ops.common": [[6, "module-data_juicer.ops.common"]], "get_sentences_from_document() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.get_sentences_from_document"]], "get_words_from_document() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.get_words_from_document"]], "merge_on_whitespace_tab_newline() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.merge_on_whitespace_tab_newline"]], "split_on_newline_tab_whitespace() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.split_on_newline_tab_whitespace"]], "split_on_whitespace() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.split_on_whitespace"]], "split_text_by_punctuation() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.split_text_by_punctuation"]], "strip() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.strip"]], "words_augmentation() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.words_augmentation"]], "words_refinement() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.words_refinement"]], "documentdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator"]], "documentminhashdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator"]], "documentsimhashdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator"]], "empty_hash_value (data_juicer.ops.deduplicator.raybasicdeduplicator attribute)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.EMPTY_HASH_VALUE"]], "imagededuplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator"]], "raybasicdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator"]], "raydocumentdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator"]], "rayimagededuplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator"]], "rayvideodeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator"]], "videodeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator"]], "__init__() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.raydocumentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.rayimagededuplicator method)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.rayvideodeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.__init__"]], "calculate_hash() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.raydocumentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.rayimagededuplicator method)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.rayvideodeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator.calculate_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.compute_hash"]], "compute_stats_single() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.compute_stats_single"]], "data_juicer.ops.deduplicator": [[7, "module-data_juicer.ops.deduplicator"]], "process() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.process"]], "process_single() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.process_single"]], "alphanumericfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AlphanumericFilter"]], "audiodurationfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioDurationFilter"]], "audionmfsnrfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter"]], "audiosizefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioSizeFilter"]], "averagelinelengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter"]], "characterrepetitionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter"]], "flaggedwordfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.FlaggedWordFilter"]], "imageaestheticsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter"]], "imageaspectratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter"]], "imagefacecountfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageFaceCountFilter"]], "imagefaceratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter"]], "imagensfwfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageNSFWFilter"]], "imagepairsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImagePairSimilarityFilter"]], "imageshapefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageShapeFilter"]], "imagesizefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageSizeFilter"]], "imagetextmatchingfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter"]], "imagetextsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter"]], "imagewatermarkfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter"]], "languageidscorefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter"]], "maximumlinelengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter"]], "perplexityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.PerplexityFilter"]], "phrasegroundingrecallfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter"]], "specialcharactersfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter"]], "specifiedfieldfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter"]], "specifiednumericfieldfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter"]], "stopwordsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.StopWordsFilter"]], "suffixfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SuffixFilter"]], "textactionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextActionFilter"]], "textentitydependencyfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter"]], "textlengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextLengthFilter"]], "tokennumfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TokenNumFilter"]], "videoaestheticsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter"]], "videoaspectratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter"]], "videodurationfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoDurationFilter"]], "videoframestextsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter"]], "videomotionscorefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter"]], "videomotionscoreraftfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoMotionScoreRaftFilter"]], "videonsfwfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoNSFWFilter"]], "videoocrarearatiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter"]], "videoresolutionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoResolutionFilter"]], "videotaggingfromframesfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter"]], "videowatermarkfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter"]], "wordrepetitionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.WordRepetitionFilter"]], "wordsnumfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.WordsNumFilter"]], "__init__() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.__init__"]], "__init__() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.__init__"]], "__init__() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.__init__"]], "__init__() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.__init__"]], "__init__() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.__init__"]], "__init__() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagefacecountfilter method)": [[8, "data_juicer.ops.filter.ImageFaceCountFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagepairsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImagePairSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.__init__"]], "__init__() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.__init__"]], "__init__() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.__init__"]], "__init__() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.__init__"]], "__init__() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.__init__"]], "__init__() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.__init__"]], "__init__() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.__init__"]], "__init__() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.__init__"]], "__init__() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.__init__"]], "__init__() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.__init__"]], "__init__() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.__init__"]], "__init__() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.__init__"]], "__init__() (data_juicer.ops.filter.videomotionscoreraftfilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreRaftFilter.__init__"]], "__init__() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.__init__"]], "__init__() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.__init__"]], "__init__() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.__init__"]], "__init__() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.__init__"]], "__init__() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.__init__"]], "compute_flow() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.compute_flow"]], "compute_flow() (data_juicer.ops.filter.videomotionscoreraftfilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreRaftFilter.compute_flow"]], "compute_stats_batched() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.compute_stats_batched"]], "compute_stats_single() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagefacecountfilter method)": [[8, "data_juicer.ops.filter.ImageFaceCountFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagepairsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImagePairSimilarityFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.compute_stats_single"]], "data_juicer.ops.filter": [[8, "module-data_juicer.ops.filter"]], "get_reader() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.get_reader"]], "process_batched() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.process_batched"]], "process_single() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.process_single"]], "process_single() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.process_single"]], "process_single() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.process_single"]], "process_single() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.process_single"]], "process_single() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.process_single"]], "process_single() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagefacecountfilter method)": [[8, "data_juicer.ops.filter.ImageFaceCountFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagepairsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImagePairSimilarityFilter.process_single"]], "process_single() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.process_single"]], "process_single() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.process_single"]], "process_single() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.process_single"]], "process_single() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.process_single"]], "process_single() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.process_single"]], "process_single() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.process_single"]], "process_single() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.process_single"]], "process_single() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.process_single"]], "process_single() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.process_single"]], "process_single() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.process_single"]], "process_single() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.process_single"]], "process_single() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.process_single"]], "process_single() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.process_single"]], "process_single() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.process_single"]], "process_single() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.process_single"]], "setup_model() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.setup_model"]], "setup_model() (data_juicer.ops.filter.videomotionscoreraftfilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreRaftFilter.setup_model"]], "audioffmpegwrappedmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper"]], "calibrateqamapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper"]], "calibratequerymapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CalibrateQueryMapper"]], "calibrateresponsemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CalibrateResponseMapper"]], "chineseconvertmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper"]], "cleancopyrightmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper"]], "cleanemailmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanEmailMapper"]], "cleanhtmlmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper"]], "cleanipmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanIpMapper"]], "cleanlinksmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanLinksMapper"]], "default_attr_pattern_template (data_juicer.ops.mapper.extractentityattributemapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityAttributeMapper.DEFAULT_ATTR_PATTERN_TEMPLATE"]], "default_completion_delimiter (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_COMPLETION_DELIMITER"]], "default_completion_delimiter (data_juicer.ops.mapper.extractkeywordmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractKeywordMapper.DEFAULT_COMPLETION_DELIMITER"]], "default_continue_prompt (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_CONTINUE_PROMPT"]], "default_demon_pattern (data_juicer.ops.mapper.extractentityattributemapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityAttributeMapper.DEFAULT_DEMON_PATTERN"]], "default_entity_pattern (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_ENTITY_PATTERN"]], "default_entity_types (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_ENTITY_TYPES"]], "default_example_template (data_juicer.ops.mapper.generateqafromexamplesmapper attribute)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.DEFAULT_EXAMPLE_TEMPLATE"]], "default_if_loop_prompt (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_IF_LOOP_PROMPT"]], "default_input_template (data_juicer.ops.mapper.calibrateqamapper attribute)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.DEFAULT_INPUT_TEMPLATE"]], "default_input_template (data_juicer.ops.mapper.extractentityattributemapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityAttributeMapper.DEFAULT_INPUT_TEMPLATE"]], "default_input_template (data_juicer.ops.mapper.extracteventmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEventMapper.DEFAULT_INPUT_TEMPLATE"]], "default_input_template (data_juicer.ops.mapper.extractnicknamemapper attribute)": [[9, "data_juicer.ops.mapper.ExtractNicknameMapper.DEFAULT_INPUT_TEMPLATE"]], "default_input_template (data_juicer.ops.mapper.generateqafromexamplesmapper attribute)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.DEFAULT_INPUT_TEMPLATE"]], "default_input_template (data_juicer.ops.mapper.optimizeqamapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.DEFAULT_INPUT_TEMPLATE"]], "default_input_template (data_juicer.ops.mapper.pairpreferencemapper attribute)": [[9, "data_juicer.ops.mapper.PairPreferenceMapper.DEFAULT_INPUT_TEMPLATE"]], "default_output_pattern (data_juicer.ops.mapper.calibrateqamapper attribute)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.DEFAULT_OUTPUT_PATTERN"]], "default_output_pattern (data_juicer.ops.mapper.extracteventmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEventMapper.DEFAULT_OUTPUT_PATTERN"]], "default_output_pattern (data_juicer.ops.mapper.extractkeywordmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractKeywordMapper.DEFAULT_OUTPUT_PATTERN"]], "default_output_pattern (data_juicer.ops.mapper.extractnicknamemapper attribute)": [[9, "data_juicer.ops.mapper.ExtractNicknameMapper.DEFAULT_OUTPUT_PATTERN"]], "default_output_pattern (data_juicer.ops.mapper.generateqafromexamplesmapper attribute)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.DEFAULT_OUTPUT_PATTERN"]], "default_output_pattern (data_juicer.ops.mapper.optimizeqamapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.DEFAULT_OUTPUT_PATTERN"]], "default_output_pattern (data_juicer.ops.mapper.pairpreferencemapper attribute)": [[9, "data_juicer.ops.mapper.PairPreferenceMapper.DEFAULT_OUTPUT_PATTERN"]], "default_prompt_template (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_PROMPT_TEMPLATE"]], "default_prompt_template (data_juicer.ops.mapper.extractkeywordmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractKeywordMapper.DEFAULT_PROMPT_TEMPLATE"]], "default_qa_pair_template (data_juicer.ops.mapper.calibrateqamapper attribute)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.DEFAULT_QA_PAIR_TEMPLATE"]], "default_qa_pair_template (data_juicer.ops.mapper.generateqafromexamplesmapper attribute)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.DEFAULT_QA_PAIR_TEMPLATE"]], "default_qa_pair_template (data_juicer.ops.mapper.optimizeqamapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.DEFAULT_QA_PAIR_TEMPLATE"]], "default_record_delimiter (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_RECORD_DELIMITER"]], "default_reference_template (data_juicer.ops.mapper.calibrateqamapper attribute)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.DEFAULT_REFERENCE_TEMPLATE"]], "default_relation_pattern (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_RELATION_PATTERN"]], "default_system_prompt (data_juicer.ops.mapper.calibrateqamapper attribute)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt (data_juicer.ops.mapper.calibratequerymapper attribute)": [[9, "data_juicer.ops.mapper.CalibrateQueryMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt (data_juicer.ops.mapper.calibrateresponsemapper attribute)": [[9, "data_juicer.ops.mapper.CalibrateResponseMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt (data_juicer.ops.mapper.extracteventmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEventMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt (data_juicer.ops.mapper.extractnicknamemapper attribute)": [[9, "data_juicer.ops.mapper.ExtractNicknameMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt (data_juicer.ops.mapper.generateqafromexamplesmapper attribute)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt (data_juicer.ops.mapper.optimizeqamapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt (data_juicer.ops.mapper.optimizequerymapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeQueryMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt (data_juicer.ops.mapper.optimizeresponsemapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeResponseMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt (data_juicer.ops.mapper.pairpreferencemapper attribute)": [[9, "data_juicer.ops.mapper.PairPreferenceMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt_template (data_juicer.ops.mapper.extractentityattributemapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityAttributeMapper.DEFAULT_SYSTEM_PROMPT_TEMPLATE"]], "default_tuple_delimiter (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_TUPLE_DELIMITER"]], "expandmacromapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper"]], "extractentityattributemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExtractEntityAttributeMapper"]], "extractentityrelationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper"]], "extracteventmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExtractEventMapper"]], "extractkeywordmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExtractKeywordMapper"]], "extractnicknamemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExtractNicknameMapper"]], "fixunicodemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper"]], "generateqafromexamplesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper"]], "generateqafromtextmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.GenerateQAFromTextMapper"]], "imageblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageBlurMapper"]], "imagecaptioningfromgpt4vmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper"]], "imagecaptioningmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper"]], "imagediffusionmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper"]], "imagefaceblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper"]], "imagetaggingmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageTaggingMapper"]], "nlpaugenmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper"]], "nlpcdazhmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper"]], "optimizeqamapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper"]], "optimizequerymapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.OptimizeQueryMapper"]], "optimizeresponsemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.OptimizeResponseMapper"]], "pairpreferencemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.PairPreferenceMapper"]], "punctuationnormalizationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper"]], "removebibliographymapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper"]], "removecommentsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper"]], "removeheadermapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper"]], "removelongwordsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper"]], "removenonchinesecharacterlmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper"]], "removerepeatsentencesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper"]], "removespecificcharsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper"]], "removetabletextmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper"]], "removewordswithincorrectsubstringsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper"]], "replacecontentmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper"]], "strategy (data_juicer.ops.mapper.videoresizeaspectratiomapper attribute)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.STRATEGY"]], "sentencesplitmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper"]], "textchunkmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.TextChunkMapper"]], "videocaptioningfromaudiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper"]], "videocaptioningfromframesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper"]], "videocaptioningfromsummarizermapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper"]], "videocaptioningfromvideomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper"]], "videoffmpegwrappedmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper"]], "videofaceblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper"]], "videoremovewatermarkmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper"]], "videoresizeaspectratiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper"]], "videoresizeresolutionmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper"]], "videosplitbydurationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper"]], "videosplitbykeyframemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper"]], "videosplitbyscenemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper"]], "videotaggingfromaudiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper"]], "videotaggingfromframesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper"]], "whitespacenormalizationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper"]], "__init__() (data_juicer.ops.mapper.audioffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper.__init__"]], "__init__() (data_juicer.ops.mapper.calibrateqamapper method)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.__init__"]], "__init__() (data_juicer.ops.mapper.chineseconvertmapper method)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleancopyrightmapper method)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanemailmapper method)": [[9, "data_juicer.ops.mapper.CleanEmailMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanhtmlmapper method)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanipmapper method)": [[9, "data_juicer.ops.mapper.CleanIpMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanlinksmapper method)": [[9, "data_juicer.ops.mapper.CleanLinksMapper.__init__"]], "__init__() (data_juicer.ops.mapper.expandmacromapper method)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper.__init__"]], "__init__() (data_juicer.ops.mapper.extractentityattributemapper method)": [[9, "data_juicer.ops.mapper.ExtractEntityAttributeMapper.__init__"]], "__init__() (data_juicer.ops.mapper.extractentityrelationmapper method)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.__init__"]], "__init__() (data_juicer.ops.mapper.extracteventmapper method)": [[9, "data_juicer.ops.mapper.ExtractEventMapper.__init__"]], "__init__() (data_juicer.ops.mapper.extractkeywordmapper method)": [[9, "data_juicer.ops.mapper.ExtractKeywordMapper.__init__"]], "__init__() (data_juicer.ops.mapper.extractnicknamemapper method)": [[9, "data_juicer.ops.mapper.ExtractNicknameMapper.__init__"]], "__init__() (data_juicer.ops.mapper.fixunicodemapper method)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper.__init__"]], "__init__() (data_juicer.ops.mapper.generateqafromexamplesmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.generateqafromtextmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromTextMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imageblurmapper method)": [[9, "data_juicer.ops.mapper.ImageBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagecaptioningfromgpt4vmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagecaptioningmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagediffusionmapper method)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagefaceblurmapper method)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagetaggingmapper method)": [[9, "data_juicer.ops.mapper.ImageTaggingMapper.__init__"]], "__init__() (data_juicer.ops.mapper.nlpaugenmapper method)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper.__init__"]], "__init__() (data_juicer.ops.mapper.nlpcdazhmapper method)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper.__init__"]], "__init__() (data_juicer.ops.mapper.optimizeqamapper method)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.__init__"]], "__init__() (data_juicer.ops.mapper.pairpreferencemapper method)": [[9, "data_juicer.ops.mapper.PairPreferenceMapper.__init__"]], "__init__() (data_juicer.ops.mapper.punctuationnormalizationmapper method)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removebibliographymapper method)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removecommentsmapper method)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removeheadermapper method)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removenonchinesecharacterlmapper method)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removerepeatsentencesmapper method)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removespecificcharsmapper method)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removetabletextmapper method)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.replacecontentmapper method)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper.__init__"]], "__init__() (data_juicer.ops.mapper.sentencesplitmapper method)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper.__init__"]], "__init__() (data_juicer.ops.mapper.textchunkmapper method)": [[9, "data_juicer.ops.mapper.TextChunkMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromsummarizermapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromvideomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videofaceblurmapper method)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoremovewatermarkmapper method)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoresizeaspectratiomapper method)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoresizeresolutionmapper method)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbyscenemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videotaggingfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videotaggingfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.whitespacenormalizationmapper method)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper.__init__"]], "add_message() (data_juicer.ops.mapper.extractentityrelationmapper method)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.add_message"]], "avaliable_detectors (data_juicer.ops.mapper.videosplitbyscenemapper attribute)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.avaliable_detectors"]], "build_input() (data_juicer.ops.mapper.calibrateqamapper method)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.build_input"]], "build_input() (data_juicer.ops.mapper.generateqafromexamplesmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.build_input"]], "build_input() (data_juicer.ops.mapper.optimizeqamapper method)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.build_input"]], "build_input() (data_juicer.ops.mapper.pairpreferencemapper method)": [[9, "data_juicer.ops.mapper.PairPreferenceMapper.build_input"]], "data_juicer.ops.mapper": [[9, "module-data_juicer.ops.mapper"]], "get_split_key_frame() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.get_split_key_frame"]], "get_text_chunks() (data_juicer.ops.mapper.textchunkmapper method)": [[9, "data_juicer.ops.mapper.TextChunkMapper.get_text_chunks"]], "light_rag_extraction() (data_juicer.ops.mapper.extractentityrelationmapper method)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.light_rag_extraction"]], "parse_output() (data_juicer.ops.mapper.calibrateqamapper method)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.calibratequerymapper method)": [[9, "data_juicer.ops.mapper.CalibrateQueryMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.calibrateresponsemapper method)": [[9, "data_juicer.ops.mapper.CalibrateResponseMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.extractentityattributemapper method)": [[9, "data_juicer.ops.mapper.ExtractEntityAttributeMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.extractentityrelationmapper method)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.extracteventmapper method)": [[9, "data_juicer.ops.mapper.ExtractEventMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.extractkeywordmapper method)": [[9, "data_juicer.ops.mapper.ExtractKeywordMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.extractnicknamemapper method)": [[9, "data_juicer.ops.mapper.ExtractNicknameMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.generateqafromexamplesmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.generateqafromtextmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromTextMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.optimizeqamapper method)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.optimizequerymapper method)": [[9, "data_juicer.ops.mapper.OptimizeQueryMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.optimizeresponsemapper method)": [[9, "data_juicer.ops.mapper.OptimizeResponseMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.pairpreferencemapper method)": [[9, "data_juicer.ops.mapper.PairPreferenceMapper.parse_output"]], "process_batched() (data_juicer.ops.mapper.chineseconvertmapper method)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleancopyrightmapper method)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleanemailmapper method)": [[9, "data_juicer.ops.mapper.CleanEmailMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleanhtmlmapper method)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleanipmapper method)": [[9, "data_juicer.ops.mapper.CleanIpMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleanlinksmapper method)": [[9, "data_juicer.ops.mapper.CleanLinksMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.expandmacromapper method)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.extractentityattributemapper method)": [[9, "data_juicer.ops.mapper.ExtractEntityAttributeMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.extracteventmapper method)": [[9, "data_juicer.ops.mapper.ExtractEventMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.fixunicodemapper method)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.generateqafromtextmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromTextMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.imagecaptioningfromgpt4vmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.imagecaptioningmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.imagediffusionmapper method)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.nlpaugenmapper method)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.nlpcdazhmapper method)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.punctuationnormalizationmapper method)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removebibliographymapper method)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removecommentsmapper method)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removeheadermapper method)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removenonchinesecharacterlmapper method)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removerepeatsentencesmapper method)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removespecificcharsmapper method)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removetabletextmapper method)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.replacecontentmapper method)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.sentencesplitmapper method)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.textchunkmapper method)": [[9, "data_juicer.ops.mapper.TextChunkMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videocaptioningfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videocaptioningfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videocaptioningfromsummarizermapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videocaptioningfromvideomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.whitespacenormalizationmapper method)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper.process_batched"]], "process_single() (data_juicer.ops.mapper.audioffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper.process_single"]], "process_single() (data_juicer.ops.mapper.calibrateqamapper method)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.process_single"]], "process_single() (data_juicer.ops.mapper.extractentityrelationmapper method)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.process_single"]], "process_single() (data_juicer.ops.mapper.extractkeywordmapper method)": [[9, "data_juicer.ops.mapper.ExtractKeywordMapper.process_single"]], "process_single() (data_juicer.ops.mapper.extractnicknamemapper method)": [[9, "data_juicer.ops.mapper.ExtractNicknameMapper.process_single"]], "process_single() (data_juicer.ops.mapper.generateqafromexamplesmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.process_single"]], "process_single() (data_juicer.ops.mapper.imageblurmapper method)": [[9, "data_juicer.ops.mapper.ImageBlurMapper.process_single"]], "process_single() (data_juicer.ops.mapper.imagefaceblurmapper method)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper.process_single"]], "process_single() (data_juicer.ops.mapper.imagetaggingmapper method)": [[9, "data_juicer.ops.mapper.ImageTaggingMapper.process_single"]], "process_single() (data_juicer.ops.mapper.optimizeqamapper method)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.process_single"]], "process_single() (data_juicer.ops.mapper.pairpreferencemapper method)": [[9, "data_juicer.ops.mapper.PairPreferenceMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videoffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videofaceblurmapper method)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videoremovewatermarkmapper method)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videoresizeaspectratiomapper method)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videoresizeresolutionmapper method)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videosplitbyscenemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videotaggingfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videotaggingfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper.process_single"]], "recursively_chunk() (data_juicer.ops.mapper.textchunkmapper method)": [[9, "data_juicer.ops.mapper.TextChunkMapper.recursively_chunk"]], "should_keep_long_word() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.should_keep_long_word"]], "should_keep_word_with_incorrect_substrings() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.should_keep_word_with_incorrect_substrings"]], "split_videos_by_duration() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.split_videos_by_duration"]], "frequencyspecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector"]], "randomselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.RandomSelector"]], "rangespecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector"]], "topkspecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector"]], "__init__() (data_juicer.ops.selector.frequencyspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector.__init__"]], "__init__() (data_juicer.ops.selector.randomselector method)": [[10, "data_juicer.ops.selector.RandomSelector.__init__"]], "__init__() (data_juicer.ops.selector.rangespecifiedfieldselector method)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector.__init__"]], "__init__() (data_juicer.ops.selector.topkspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector.__init__"]], "data_juicer.ops.selector": [[10, "module-data_juicer.ops.selector"]], "process() (data_juicer.ops.selector.frequencyspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector.process"]], "process() (data_juicer.ops.selector.randomselector method)": [[10, "data_juicer.ops.selector.RandomSelector.process"]], "process() (data_juicer.ops.selector.rangespecifiedfieldselector method)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector.process"]], "process() (data_juicer.ops.selector.topkspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector.process"]], "data_juicer.tools": [[11, "module-data_juicer.tools"]], "data_juicer.utils": [[12, "module-data_juicer.utils"]]}}) \ No newline at end of file