From 940b94110d3258b87dc58d74639b219ec15635aa Mon Sep 17 00:00:00 2001
From: yxdyc <yxdyc@users.noreply.github.com>
Date: Fri, 2 Aug 2024 09:48:04 +0000
Subject: [PATCH] deploy: 4d8c521fb8477ceda45f2adad10979624ee0ca6c

---
 .../ops/filter/video_aesthetics_filter.html   |  34 +--
 .../ops/filter/video_duration_filter.html     |   6 +-
 .../video_frames_text_similarity_filter.html  |  35 +--
 .../ops/filter/video_nsfw_filter.html         |  33 +--
 .../ops/filter/video_watermark_filter.html    |  30 ++-
 .../ops/selector/random_selector.html         | 163 +++++++++++++
 .../range_specified_field_selector.html       | 223 ++++++++++++++++++
 .../topk_specified_field_selector.html        |  16 +-
 _modules/index.html                           |   2 +
 data_juicer.ops.filter.html                   |   4 +-
 data_juicer.ops.selector.html                 | 103 ++++++++
 genindex.html                                 |  16 +-
 index.html                                    |   2 +
 objects.inv                                   | Bin 5281 -> 5335 bytes
 searchindex.js                                |   2 +-
 15 files changed, 592 insertions(+), 77 deletions(-)
 create mode 100644 _modules/data_juicer/ops/selector/random_selector.html
 create mode 100644 _modules/data_juicer/ops/selector/range_specified_field_selector.html
diff --git a/_modules/data_juicer/ops/filter/video_aesthetics_filter.html b/_modules/data_juicer/ops/filter/video_aesthetics_filter.html
index e49760577..809086399 100644
--- a/_modules/data_juicer/ops/filter/video_aesthetics_filter.html
+++ b/_modules/data_juicer/ops/filter/video_aesthetics_filter.html
@@ -239,23 +239,27 @@ <h1>Source code for data_juicer.ops.filter.video_aesthetics_filter</h1><div clas
                     <span class="n">sample</span><span class="p">[</span><span class="n">Fields</span><span class="o">.</span><span class="n">context</span><span class="p">][</span><span class="n">sampled_frames_key</span><span class="p">]</span> <span class="o">=</span> <span class="n">frames</span>
             <span class="n">frame_images</span> <span class="o">=</span> <span class="p">[</span><span class="n">frame</span><span class="o">.</span><span class="n">to_image</span><span class="p">()</span> <span class="k">for</span> <span class="n">frame</span> <span class="ow">in</span> <span class="n">frames</span><span class="p">]</span>
 
-            <span class="c1"># compute aesthetics_scores</span>
-            <span class="n">model</span><span class="p">,</span> <span class="n">processor</span> <span class="o">=</span> <span class="n">get_model</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">model_key</span><span class="p">,</span> <span class="n">rank</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">use_cuda</span><span class="p">())</span>
-            <span class="n">inputs</span> <span class="o">=</span> <span class="n">processor</span><span class="p">(</span><span class="n">images</span><span class="o">=</span><span class="n">frame_images</span><span class="p">,</span>
-                               <span class="n">return_tensors</span><span class="o">=</span><span class="s1">&#39;pt&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">device</span><span class="p">)</span>
-            <span class="k">with</span> <span class="n">torch</span><span class="o">.</span><span class="n">no_grad</span><span class="p">():</span>
-                <span class="n">outputs</span> <span class="o">=</span> <span class="n">model</span><span class="p">(</span><span class="o">**</span><span class="n">inputs</span><span class="p">)</span>
-            <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">need_normalized_by_ten</span><span class="p">:</span>
-                <span class="n">aesthetics_score</span> <span class="o">=</span> <span class="n">outputs</span><span class="o">.</span><span class="n">logits</span> <span class="o">/</span> <span class="mf">10.0</span>
-            <span class="k">else</span><span class="p">:</span>
-                <span class="n">aesthetics_score</span> <span class="o">=</span> <span class="n">outputs</span><span class="o">.</span><span class="n">logits</span>
+            <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">frame_images</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
+                <span class="c1"># compute aesthetics_scores</span>
+                <span class="n">model</span><span class="p">,</span> <span class="n">processor</span> <span class="o">=</span> <span class="n">get_model</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">model_key</span><span class="p">,</span> <span class="n">rank</span><span class="o">=</span><span class="n">rank</span><span class="p">)</span>
+                <span class="n">inputs</span> <span class="o">=</span> <span class="n">processor</span><span class="p">(</span><span class="n">images</span><span class="o">=</span><span class="n">frame_images</span><span class="p">,</span>
+                                   <span class="n">return_tensors</span><span class="o">=</span><span class="s1">&#39;pt&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">device</span><span class="p">)</span>
+                <span class="k">with</span> <span class="n">torch</span><span class="o">.</span><span class="n">no_grad</span><span class="p">():</span>
+                    <span class="n">outputs</span> <span class="o">=</span> <span class="n">model</span><span class="p">(</span><span class="o">**</span><span class="n">inputs</span><span class="p">)</span>
+                <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">need_normalized_by_ten</span><span class="p">:</span>
+                    <span class="n">aesthetics_score</span> <span class="o">=</span> <span class="n">outputs</span><span class="o">.</span><span class="n">logits</span> <span class="o">/</span> <span class="mf">10.0</span>
+                <span class="k">else</span><span class="p">:</span>
+                    <span class="n">aesthetics_score</span> <span class="o">=</span> <span class="n">outputs</span><span class="o">.</span><span class="n">logits</span>
 
-            <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">reduce_mode</span> <span class="o">==</span> <span class="s1">&#39;avg&#39;</span><span class="p">:</span>
-                <span class="n">aesthetics_score</span> <span class="o">=</span> <span class="nb">float</span><span class="p">(</span><span class="n">aesthetics_score</span><span class="o">.</span><span class="n">mean</span><span class="p">())</span>
-            <span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">reduce_mode</span> <span class="o">==</span> <span class="s1">&#39;max&#39;</span><span class="p">:</span>
-                <span class="n">aesthetics_score</span> <span class="o">=</span> <span class="nb">float</span><span class="p">(</span><span class="n">aesthetics_score</span><span class="o">.</span><span class="n">max</span><span class="p">())</span>
+                <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">reduce_mode</span> <span class="o">==</span> <span class="s1">&#39;avg&#39;</span><span class="p">:</span>
+                    <span class="n">aesthetics_score</span> <span class="o">=</span> <span class="nb">float</span><span class="p">(</span><span class="n">aesthetics_score</span><span class="o">.</span><span class="n">mean</span><span class="p">())</span>
+                <span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">reduce_mode</span> <span class="o">==</span> <span class="s1">&#39;max&#39;</span><span class="p">:</span>
+                    <span class="n">aesthetics_score</span> <span class="o">=</span> <span class="nb">float</span><span class="p">(</span><span class="n">aesthetics_score</span><span class="o">.</span><span class="n">max</span><span class="p">())</span>
+                <span class="k">else</span><span class="p">:</span>
+                    <span class="n">aesthetics_score</span> <span class="o">=</span> <span class="nb">float</span><span class="p">(</span><span class="n">aesthetics_score</span><span class="o">.</span><span class="n">min</span><span class="p">())</span>
             <span class="k">else</span><span class="p">:</span>
-                <span class="n">aesthetics_score</span> <span class="o">=</span> <span class="nb">float</span><span class="p">(</span><span class="n">aesthetics_score</span><span class="o">.</span><span class="n">min</span><span class="p">())</span>
+                <span class="n">aesthetics_score</span> <span class="o">=</span> <span class="mf">0.0</span>
+
             <span class="n">aesthetics_scores</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">aesthetics_score</span><span class="p">)</span>
 
         <span class="n">logger</span><span class="o">.</span><span class="n">debug</span><span class="p">(</span><span class="sa">f</span><span class="s1">&#39;aesthetics_score: </span><span class="si">{</span><span class="n">aesthetics_scores</span><span class="si">}</span><span class="s1">&#39;</span><span class="p">)</span>
diff --git a/_modules/data_juicer/ops/filter/video_duration_filter.html b/_modules/data_juicer/ops/filter/video_duration_filter.html
index 61458e793..27c541de5 100644
--- a/_modules/data_juicer/ops/filter/video_duration_filter.html
+++ b/_modules/data_juicer/ops/filter/video_duration_filter.html
@@ -85,7 +85,7 @@ <h1>Source code for data_juicer.ops.filter.video_duration_filter</h1><div class=
 <span></span><span class="kn">import</span> <span class="nn">sys</span>
 
 <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
-<span class="kn">from</span> <span class="nn">jsonargparse.typing</span> <span class="kn">import</span> <span class="n">NonNegativeInt</span>
+<span class="kn">from</span> <span class="nn">jsonargparse.typing</span> <span class="kn">import</span> <span class="n">NonNegativeFloat</span>
 
 <span class="kn">from</span> <span class="nn">data_juicer.utils.constant</span> <span class="kn">import</span> <span class="n">Fields</span><span class="p">,</span> <span class="n">StatsKeys</span>
 <span class="kn">from</span> <span class="nn">data_juicer.utils.mm_utils</span> <span class="kn">import</span> <span class="p">(</span><span class="n">close_video</span><span class="p">,</span> <span class="n">load_data_with_context</span><span class="p">,</span>
@@ -104,8 +104,8 @@ <h1>Source code for data_juicer.ops.filter.video_duration_filter</h1><div class=
 <span class="sd">    &quot;&quot;&quot;</span>
 
 <div class="viewcode-block" id="VideoDurationFilter.__init__"><a class="viewcode-back" href="../../../../data_juicer.ops.filter.html#data_juicer.ops.filter.VideoDurationFilter.__init__">[docs]</a>    <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span>
-                 <span class="n">min_duration</span><span class="p">:</span> <span class="n">NonNegativeInt</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span>
-                 <span class="n">max_duration</span><span class="p">:</span> <span class="n">NonNegativeInt</span> <span class="o">=</span> <span class="n">sys</span><span class="o">.</span><span class="n">maxsize</span><span class="p">,</span>
+                 <span class="n">min_duration</span><span class="p">:</span> <span class="n">NonNegativeFloat</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span>
+                 <span class="n">max_duration</span><span class="p">:</span> <span class="n">NonNegativeFloat</span> <span class="o">=</span> <span class="n">sys</span><span class="o">.</span><span class="n">maxsize</span><span class="p">,</span>
                  <span class="n">any_or_all</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s1">&#39;any&#39;</span><span class="p">,</span>
                  <span class="o">*</span><span class="n">args</span><span class="p">,</span>
                  <span class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
diff --git a/_modules/data_juicer/ops/filter/video_frames_text_similarity_filter.html b/_modules/data_juicer/ops/filter/video_frames_text_similarity_filter.html
index 9e4641138..a60d7b2e5 100644
--- a/_modules/data_juicer/ops/filter/video_frames_text_similarity_filter.html
+++ b/_modules/data_juicer/ops/filter/video_frames_text_similarity_filter.html
@@ -255,23 +255,26 @@ <h1>Source code for data_juicer.ops.filter.video_frames_text_similarity_filter</
                             <span class="n">image</span> <span class="o">=</span> <span class="n">ImageOps</span><span class="o">.</span><span class="n">flip</span><span class="p">(</span><span class="n">image</span><span class="p">)</span>
                         <span class="n">video_frame_images_chunk</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">image</span><span class="p">)</span>
 
-                <span class="n">inputs</span> <span class="o">=</span> <span class="n">processor</span><span class="p">(</span><span class="n">text</span><span class="o">=</span><span class="n">text_chunk</span><span class="p">,</span>
-                                   <span class="n">images</span><span class="o">=</span><span class="n">video_frame_images_chunk</span><span class="p">,</span>
-                                   <span class="n">return_tensors</span><span class="o">=</span><span class="s1">&#39;pt&#39;</span><span class="p">,</span>
-                                   <span class="n">truncation</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
-                                   <span class="n">max_length</span><span class="o">=</span><span class="n">model</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">text_config</span><span class="o">.</span>
-                                   <span class="n">max_position_embeddings</span><span class="p">,</span>
-                                   <span class="n">padding</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">device</span><span class="p">)</span>
-
-                <span class="n">outputs</span> <span class="o">=</span> <span class="n">model</span><span class="p">(</span><span class="o">**</span><span class="n">inputs</span><span class="p">)</span>
-                <span class="n">chunk_logits</span> <span class="o">=</span> <span class="n">outputs</span><span class="o">.</span><span class="n">logits_per_text</span> <span class="o">/</span> <span class="mf">100.0</span>
-
-                <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">reduce_mode</span> <span class="o">==</span> <span class="s1">&#39;avg&#39;</span><span class="p">:</span>
-                    <span class="n">chunk_similarity</span> <span class="o">=</span> <span class="n">chunk_logits</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span>
-                <span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">reduce_mode</span> <span class="o">==</span> <span class="s1">&#39;max&#39;</span><span class="p">:</span>
-                    <span class="n">chunk_similarity</span> <span class="o">=</span> <span class="n">chunk_logits</span><span class="o">.</span><span class="n">max</span><span class="p">()</span>
+                <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">video_frame_images_chunk</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
+                    <span class="n">inputs</span> <span class="o">=</span> <span class="n">processor</span><span class="p">(</span><span class="n">text</span><span class="o">=</span><span class="n">text_chunk</span><span class="p">,</span>
+                                       <span class="n">images</span><span class="o">=</span><span class="n">video_frame_images_chunk</span><span class="p">,</span>
+                                       <span class="n">return_tensors</span><span class="o">=</span><span class="s1">&#39;pt&#39;</span><span class="p">,</span>
+                                       <span class="n">truncation</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
+                                       <span class="n">max_length</span><span class="o">=</span><span class="n">model</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">text_config</span><span class="o">.</span>
+                                       <span class="n">max_position_embeddings</span><span class="p">,</span>
+                                       <span class="n">padding</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">device</span><span class="p">)</span>
+
+                    <span class="n">outputs</span> <span class="o">=</span> <span class="n">model</span><span class="p">(</span><span class="o">**</span><span class="n">inputs</span><span class="p">)</span>
+                    <span class="n">chunk_logits</span> <span class="o">=</span> <span class="n">outputs</span><span class="o">.</span><span class="n">logits_per_text</span> <span class="o">/</span> <span class="mf">100.0</span>
+
+                    <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">reduce_mode</span> <span class="o">==</span> <span class="s1">&#39;avg&#39;</span><span class="p">:</span>
+                        <span class="n">chunk_similarity</span> <span class="o">=</span> <span class="n">chunk_logits</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span>
+                    <span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">reduce_mode</span> <span class="o">==</span> <span class="s1">&#39;max&#39;</span><span class="p">:</span>
+                        <span class="n">chunk_similarity</span> <span class="o">=</span> <span class="n">chunk_logits</span><span class="o">.</span><span class="n">max</span><span class="p">()</span>
+                    <span class="k">else</span><span class="p">:</span>
+                        <span class="n">chunk_similarity</span> <span class="o">=</span> <span class="n">chunk_logits</span><span class="o">.</span><span class="n">min</span><span class="p">()</span>
                 <span class="k">else</span><span class="p">:</span>
-                    <span class="n">chunk_similarity</span> <span class="o">=</span> <span class="n">chunk_logits</span><span class="o">.</span><span class="n">min</span><span class="p">()</span>
+                    <span class="n">chunk_similarity</span> <span class="o">=</span> <span class="mf">0.0</span>
 
                 <span class="n">similarity</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="nb">float</span><span class="p">(</span><span class="n">chunk_similarity</span><span class="p">))</span>
             <span class="n">offset</span> <span class="o">+=</span> <span class="n">count</span>
diff --git a/_modules/data_juicer/ops/filter/video_nsfw_filter.html b/_modules/data_juicer/ops/filter/video_nsfw_filter.html
index 81ab4494b..856298073 100644
--- a/_modules/data_juicer/ops/filter/video_nsfw_filter.html
+++ b/_modules/data_juicer/ops/filter/video_nsfw_filter.html
@@ -219,21 +219,26 @@ <h1>Source code for data_juicer.ops.filter.video_nsfw_filter</h1><div class="hig
                     <span class="n">sample</span><span class="p">[</span><span class="n">Fields</span><span class="o">.</span><span class="n">context</span><span class="p">][</span><span class="n">sampled_frames_key</span><span class="p">]</span> <span class="o">=</span> <span class="n">frames</span>
 
             <span class="n">frame_images</span> <span class="o">=</span> <span class="p">[</span><span class="n">frame</span><span class="o">.</span><span class="n">to_image</span><span class="p">()</span> <span class="k">for</span> <span class="n">frame</span> <span class="ow">in</span> <span class="n">frames</span><span class="p">]</span>
-            <span class="n">inputs</span> <span class="o">=</span> <span class="n">processor</span><span class="p">(</span><span class="n">images</span><span class="o">=</span><span class="n">frame_images</span><span class="p">,</span> <span class="n">return_tensors</span><span class="o">=</span><span class="s1">&#39;pt&#39;</span><span class="p">)</span>
-            <span class="n">inputs</span> <span class="o">=</span> <span class="n">inputs</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">device</span><span class="p">)</span>
-            <span class="n">outputs</span> <span class="o">=</span> <span class="n">model</span><span class="p">(</span><span class="o">**</span><span class="n">inputs</span><span class="p">)</span>
-            <span class="n">logits</span> <span class="o">=</span> <span class="n">outputs</span><span class="o">.</span><span class="n">logits</span>
-            <span class="n">cur_scores</span> <span class="o">=</span> <span class="p">[</span>
-                <span class="n">scores</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="k">for</span> <span class="n">scores</span> <span class="ow">in</span> <span class="n">torch</span><span class="o">.</span><span class="n">softmax</span><span class="p">(</span><span class="n">logits</span><span class="p">,</span> <span class="n">dim</span><span class="o">=-</span><span class="mi">1</span><span class="p">)</span>
-            <span class="p">]</span>
-            <span class="n">cur_scores</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">(</span><span class="n">cur_scores</span><span class="p">)</span>
-
-            <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">reduce_mode</span> <span class="o">==</span> <span class="s1">&#39;avg&#39;</span><span class="p">:</span>
-                <span class="n">cur_score</span> <span class="o">=</span> <span class="n">cur_scores</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span>
-            <span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">reduce_mode</span> <span class="o">==</span> <span class="s1">&#39;max&#39;</span><span class="p">:</span>
-                <span class="n">cur_score</span> <span class="o">=</span> <span class="n">cur_scores</span><span class="o">.</span><span class="n">max</span><span class="p">()</span>
+
+            <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">frame_images</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
+                <span class="n">inputs</span> <span class="o">=</span> <span class="n">processor</span><span class="p">(</span><span class="n">images</span><span class="o">=</span><span class="n">frame_images</span><span class="p">,</span> <span class="n">return_tensors</span><span class="o">=</span><span class="s1">&#39;pt&#39;</span><span class="p">)</span>
+                <span class="n">inputs</span> <span class="o">=</span> <span class="n">inputs</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">device</span><span class="p">)</span>
+                <span class="n">outputs</span> <span class="o">=</span> <span class="n">model</span><span class="p">(</span><span class="o">**</span><span class="n">inputs</span><span class="p">)</span>
+                <span class="n">logits</span> <span class="o">=</span> <span class="n">outputs</span><span class="o">.</span><span class="n">logits</span>
+                <span class="n">cur_scores</span> <span class="o">=</span> <span class="p">[</span>
+                    <span class="n">scores</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="k">for</span> <span class="n">scores</span> <span class="ow">in</span> <span class="n">torch</span><span class="o">.</span><span class="n">softmax</span><span class="p">(</span><span class="n">logits</span><span class="p">,</span> <span class="n">dim</span><span class="o">=-</span><span class="mi">1</span><span class="p">)</span>
+                <span class="p">]</span>
+                <span class="n">cur_scores</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">(</span><span class="n">cur_scores</span><span class="p">)</span>
+
+                <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">reduce_mode</span> <span class="o">==</span> <span class="s1">&#39;avg&#39;</span><span class="p">:</span>
+                    <span class="n">cur_score</span> <span class="o">=</span> <span class="n">cur_scores</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span>
+                <span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">reduce_mode</span> <span class="o">==</span> <span class="s1">&#39;max&#39;</span><span class="p">:</span>
+                    <span class="n">cur_score</span> <span class="o">=</span> <span class="n">cur_scores</span><span class="o">.</span><span class="n">max</span><span class="p">()</span>
+                <span class="k">else</span><span class="p">:</span>
+                    <span class="n">cur_score</span> <span class="o">=</span> <span class="n">cur_scores</span><span class="o">.</span><span class="n">min</span><span class="p">()</span>
             <span class="k">else</span><span class="p">:</span>
-                <span class="n">cur_score</span> <span class="o">=</span> <span class="n">cur_scores</span><span class="o">.</span><span class="n">min</span><span class="p">()</span>
+                <span class="n">cur_score</span> <span class="o">=</span> <span class="mf">0.0</span>
+
             <span class="n">nsfw_scores</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="nb">float</span><span class="p">(</span><span class="n">cur_score</span><span class="p">))</span>
 
         <span class="n">sample</span><span class="p">[</span><span class="n">Fields</span><span class="o">.</span><span class="n">stats</span><span class="p">][</span><span class="n">StatsKeys</span><span class="o">.</span><span class="n">video_nsfw_score</span><span class="p">]</span> <span class="o">=</span> <span class="n">nsfw_scores</span>
diff --git a/_modules/data_juicer/ops/filter/video_watermark_filter.html b/_modules/data_juicer/ops/filter/video_watermark_filter.html
index c92ced165..8ae26e9a9 100644
--- a/_modules/data_juicer/ops/filter/video_watermark_filter.html
+++ b/_modules/data_juicer/ops/filter/video_watermark_filter.html
@@ -222,19 +222,25 @@ <h1>Source code for data_juicer.ops.filter.video_watermark_filter</h1><div class
                     <span class="n">sample</span><span class="p">[</span><span class="n">Fields</span><span class="o">.</span><span class="n">context</span><span class="p">][</span><span class="n">sampled_frames_key</span><span class="p">]</span> <span class="o">=</span> <span class="n">frames</span>
 
             <span class="n">frame_images</span> <span class="o">=</span> <span class="p">[</span><span class="n">frame</span><span class="o">.</span><span class="n">to_image</span><span class="p">()</span> <span class="k">for</span> <span class="n">frame</span> <span class="ow">in</span> <span class="n">frames</span><span class="p">]</span>
-            <span class="n">inputs</span> <span class="o">=</span> <span class="n">processor</span><span class="p">(</span><span class="n">images</span><span class="o">=</span><span class="n">frame_images</span><span class="p">,</span> <span class="n">return_tensors</span><span class="o">=</span><span class="s1">&#39;pt&#39;</span><span class="p">)</span>
-            <span class="n">inputs</span> <span class="o">=</span> <span class="n">inputs</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">device</span><span class="p">)</span>
-            <span class="n">outputs</span> <span class="o">=</span> <span class="n">model</span><span class="p">(</span><span class="o">**</span><span class="n">inputs</span><span class="p">)</span>
-            <span class="n">logits</span> <span class="o">=</span> <span class="n">outputs</span><span class="o">.</span><span class="n">logits</span>
-            <span class="n">cur_probs</span> <span class="o">=</span> <span class="p">[</span><span class="n">probs</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="k">for</span> <span class="n">probs</span> <span class="ow">in</span> <span class="n">torch</span><span class="o">.</span><span class="n">softmax</span><span class="p">(</span><span class="n">logits</span><span class="p">,</span> <span class="n">dim</span><span class="o">=-</span><span class="mi">1</span><span class="p">)]</span>
-            <span class="n">cur_probs</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">(</span><span class="n">cur_probs</span><span class="p">)</span>
-
-            <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">reduce_mode</span> <span class="o">==</span> <span class="s1">&#39;avg&#39;</span><span class="p">:</span>
-                <span class="n">cur_prob</span> <span class="o">=</span> <span class="n">cur_probs</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span>
-            <span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">reduce_mode</span> <span class="o">==</span> <span class="s1">&#39;max&#39;</span><span class="p">:</span>
-                <span class="n">cur_prob</span> <span class="o">=</span> <span class="n">cur_probs</span><span class="o">.</span><span class="n">max</span><span class="p">()</span>
+
+            <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">frame_images</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
+                <span class="n">inputs</span> <span class="o">=</span> <span class="n">processor</span><span class="p">(</span><span class="n">images</span><span class="o">=</span><span class="n">frame_images</span><span class="p">,</span> <span class="n">return_tensors</span><span class="o">=</span><span class="s1">&#39;pt&#39;</span><span class="p">)</span>
+                <span class="n">inputs</span> <span class="o">=</span> <span class="n">inputs</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">device</span><span class="p">)</span>
+                <span class="n">outputs</span> <span class="o">=</span> <span class="n">model</span><span class="p">(</span><span class="o">**</span><span class="n">inputs</span><span class="p">)</span>
+                <span class="n">logits</span> <span class="o">=</span> <span class="n">outputs</span><span class="o">.</span><span class="n">logits</span>
+                <span class="n">cur_probs</span> <span class="o">=</span> <span class="p">[</span>
+                    <span class="n">probs</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="k">for</span> <span class="n">probs</span> <span class="ow">in</span> <span class="n">torch</span><span class="o">.</span><span class="n">softmax</span><span class="p">(</span><span class="n">logits</span><span class="p">,</span> <span class="n">dim</span><span class="o">=-</span><span class="mi">1</span><span class="p">)</span>
+                <span class="p">]</span>
+                <span class="n">cur_probs</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">(</span><span class="n">cur_probs</span><span class="p">)</span>
+
+                <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">reduce_mode</span> <span class="o">==</span> <span class="s1">&#39;avg&#39;</span><span class="p">:</span>
+                    <span class="n">cur_prob</span> <span class="o">=</span> <span class="n">cur_probs</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span>
+                <span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">reduce_mode</span> <span class="o">==</span> <span class="s1">&#39;max&#39;</span><span class="p">:</span>
+                    <span class="n">cur_prob</span> <span class="o">=</span> <span class="n">cur_probs</span><span class="o">.</span><span class="n">max</span><span class="p">()</span>
+                <span class="k">else</span><span class="p">:</span>
+                    <span class="n">cur_prob</span> <span class="o">=</span> <span class="n">cur_probs</span><span class="o">.</span><span class="n">min</span><span class="p">()</span>
             <span class="k">else</span><span class="p">:</span>
-                <span class="n">cur_prob</span> <span class="o">=</span> <span class="n">cur_probs</span><span class="o">.</span><span class="n">min</span><span class="p">()</span>
+                <span class="n">cur_prob</span> <span class="o">=</span> <span class="mf">0.0</span>
             <span class="n">watermark_probs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="nb">float</span><span class="p">(</span><span class="n">cur_prob</span><span class="p">))</span>
 
         <span class="n">sample</span><span class="p">[</span><span class="n">Fields</span><span class="o">.</span><span class="n">stats</span><span class="p">][</span><span class="n">StatsKeys</span><span class="o">.</span><span class="n">video_watermark_prob</span><span class="p">]</span> <span class="o">=</span> <span class="n">watermark_probs</span>
diff --git a/_modules/data_juicer/ops/selector/random_selector.html b/_modules/data_juicer/ops/selector/random_selector.html
new file mode 100644
index 000000000..175203884
--- /dev/null
+++ b/_modules/data_juicer/ops/selector/random_selector.html
@@ -0,0 +1,163 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en">
+<head>
+  <meta charset="utf-8" />
+  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+  <title>data_juicer.ops.selector.random_selector &mdash; data_juicer 0.2.0 documentation</title>
+      <link rel="stylesheet" type="text/css" href="../../../../_static/pygments.css?v=80d5e7a1" />
+      <link rel="stylesheet" type="text/css" href="../../../../_static/css/theme.css?v=19f00094" />
+
+  
+  <!--[if lt IE 9]>
+    <script src="../../../../_static/js/html5shiv.min.js"></script>
+  <![endif]-->
+  
+        <script data-url_root="../../../../" id="documentation_options" src="../../../../_static/documentation_options.js?v=b1f64a84"></script>
+        <script src="../../../../_static/doctools.js?v=888ff710"></script>
+        <script src="../../../../_static/sphinx_highlight.js?v=4825356b"></script>
+    <script src="../../../../_static/js/theme.js"></script>
+    <link rel="index" title="Index" href="../../../../genindex.html" />
+    <link rel="search" title="Search" href="../../../../search.html" /> 
+</head>
+
+<body class="wy-body-for-nav"> 
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
+      <div class="wy-side-scroll">
+        <div class="wy-side-nav-search" >
+
+          
+          
+          <a href="../../../../index.html" class="icon icon-home">
+            data_juicer
+          </a>
+              <div class="version">
+                0.2.0
+              </div>
+<div role="search">
+  <form id="rtd-search-form" class="wy-form" action="../../../../search.html" method="get">
+    <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
+    <input type="hidden" name="check_keywords" value="yes" />
+    <input type="hidden" name="area" value="default" />
+  </form>
+</div>
+        </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <p class="caption" role="heading"><span class="caption-text">API Reference</span></p>
+<ul>
+<li class="toctree-l1"><a class="reference internal" href="../../../../data_juicer.core.html">data_juicer.core</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../data_juicer.ops.html">data_juicer.ops</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../data_juicer.ops.filter.html">data_juicer.ops.filter</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../data_juicer.ops.mapper.html">data_juicer.ops.mapper</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../data_juicer.ops.deduplicator.html">data_juicer.ops.deduplicator</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../data_juicer.ops.selector.html">data_juicer.ops.selector</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../data_juicer.ops.common.html">data_juicer.ops.common</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../data_juicer.analysis.html">data_juicer.analysis</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../data_juicer.config.html">data_juicer.config</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../data_juicer.format.html">data_juicer.format</a></li>
+</ul>
+
+        </div>
+      </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../../../../index.html">data_juicer</a>
+      </nav>
+
+      <div class="wy-nav-content">
+        <div class="rst-content">
+          <div role="navigation" aria-label="Page navigation">
+  <ul class="wy-breadcrumbs">
+      <li><a href="../../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
+          <li class="breadcrumb-item"><a href="../../../index.html">Module code</a></li>
+          <li class="breadcrumb-item"><a href="../../../data_juicer.html">data_juicer</a></li>
+      <li class="breadcrumb-item active">data_juicer.ops.selector.random_selector</li>
+      <li class="wy-breadcrumbs-aside">
+      </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+           <div itemprop="articleBody">
+             
+  <h1>Source code for data_juicer.ops.selector.random_selector</h1><div class="highlight"><pre>
+<span></span><span class="kn">from</span> <span class="nn">jsonargparse.typing</span> <span class="kn">import</span> <span class="n">ClosedUnitInterval</span><span class="p">,</span> <span class="n">PositiveInt</span>
+
+<span class="kn">from</span> <span class="nn">data_juicer.format.mixture_formatter</span> <span class="kn">import</span> <span class="n">MixtureFormatter</span>
+
+<span class="kn">from</span> <span class="nn">..base_op</span> <span class="kn">import</span> <span class="n">OPERATORS</span><span class="p">,</span> <span class="n">Selector</span>
+
+
+<div class="viewcode-block" id="RandomSelector"><a class="viewcode-back" href="../../../../data_juicer.ops.selector.html#data_juicer.ops.selector.RandomSelector">[docs]</a><span class="nd">@OPERATORS</span><span class="o">.</span><span class="n">register_module</span><span class="p">(</span><span class="s1">&#39;random_selector&#39;</span><span class="p">)</span>
+<span class="k">class</span> <span class="nc">RandomSelector</span><span class="p">(</span><span class="n">Selector</span><span class="p">):</span>
+<span class="w">    </span><span class="sd">&quot;&quot;&quot;Selector to random select samples. &quot;&quot;&quot;</span>
+
+<div class="viewcode-block" id="RandomSelector.__init__"><a class="viewcode-back" href="../../../../data_juicer.ops.selector.html#data_juicer.ops.selector.RandomSelector.__init__">[docs]</a>    <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span>
+                 <span class="n">select_ratio</span><span class="p">:</span> <span class="n">ClosedUnitInterval</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
+                 <span class="n">select_num</span><span class="p">:</span> <span class="n">PositiveInt</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
+                 <span class="o">*</span><span class="n">args</span><span class="p">,</span>
+                 <span class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
+<span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
+<span class="sd">        Initialization method.</span>
+
+<span class="sd">        :param select_ratio: The ratio to select. When both</span>
+<span class="sd">            select_ratio and select_num are set, the value corresponding</span>
+<span class="sd">            to the smaller number of samples will be applied.</span>
+<span class="sd">        :param select_num: The number of samples to select. When both</span>
+<span class="sd">            select_ratio and select_num are set, the value corresponding</span>
+<span class="sd">            to the smaller number of samples will be applied.</span>
+<span class="sd">        :param args: extra args</span>
+<span class="sd">        :param kwargs: extra args</span>
+<span class="sd">        &quot;&quot;&quot;</span>
+        <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
+        <span class="bp">self</span><span class="o">.</span><span class="n">select_ratio</span> <span class="o">=</span> <span class="n">select_ratio</span>
+        <span class="bp">self</span><span class="o">.</span><span class="n">select_num</span> <span class="o">=</span> <span class="n">select_num</span></div>
+
+<div class="viewcode-block" id="RandomSelector.process"><a class="viewcode-back" href="../../../../data_juicer.ops.selector.html#data_juicer.ops.selector.RandomSelector.process">[docs]</a>    <span class="k">def</span> <span class="nf">process</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">dataset</span><span class="p">):</span>
+        <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">dataset</span><span class="p">)</span> <span class="o">&lt;=</span> <span class="mi">1</span><span class="p">:</span>
+            <span class="k">return</span> <span class="n">dataset</span>
+
+        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">select_ratio</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">select_num</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
+            <span class="k">return</span> <span class="n">dataset</span>
+
+        <span class="n">select_num</span> <span class="o">=</span> <span class="mi">0</span>
+        <span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">select_ratio</span><span class="p">:</span>
+            <span class="n">select_num</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">select_num</span>
+        <span class="k">else</span><span class="p">:</span>
+            <span class="n">select_num</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">select_ratio</span> <span class="o">*</span> <span class="nb">len</span><span class="p">(</span><span class="n">dataset</span><span class="p">))</span>
+            <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">select_num</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">select_num</span> <span class="o">&lt;</span> <span class="n">select_num</span><span class="p">:</span>
+                <span class="n">select_num</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">select_num</span>
+
+        <span class="k">return</span> <span class="n">MixtureFormatter</span><span class="o">.</span><span class="n">random_sample</span><span class="p">(</span><span class="n">dataset</span><span class="p">,</span>
+                                              <span class="n">sample_number</span><span class="o">=</span><span class="n">select_num</span><span class="p">)</span></div></div>
+</pre></div>
+
+           </div>
+          </div>
+          <footer>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <p>&#169; Copyright 2024, Data-Juicer Team.</p>
+  </div>
+
+  Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
+    <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
+    provided by <a href="https://readthedocs.org">Read the Docs</a>.
+   
+
+</footer>
+        </div>
+      </div>
+    </section>
+  </div>
+  <script>
+      jQuery(function () {
+          SphinxRtdTheme.Navigation.enable(true);
+      });
+  </script> 
+
+</body>
+</html>
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/selector/range_specified_field_selector.html b/_modules/data_juicer/ops/selector/range_specified_field_selector.html
new file mode 100644
index 000000000..50404a19c
--- /dev/null
+++ b/_modules/data_juicer/ops/selector/range_specified_field_selector.html
@@ -0,0 +1,223 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en">
+<head>
+  <meta charset="utf-8" />
+  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+  <title>data_juicer.ops.selector.range_specified_field_selector &mdash; data_juicer 0.2.0 documentation</title>
+      <link rel="stylesheet" type="text/css" href="../../../../_static/pygments.css?v=80d5e7a1" />
+      <link rel="stylesheet" type="text/css" href="../../../../_static/css/theme.css?v=19f00094" />
+
+  
+  <!--[if lt IE 9]>
+    <script src="../../../../_static/js/html5shiv.min.js"></script>
+  <![endif]-->
+  
+        <script data-url_root="../../../../" id="documentation_options" src="../../../../_static/documentation_options.js?v=b1f64a84"></script>
+        <script src="../../../../_static/doctools.js?v=888ff710"></script>
+        <script src="../../../../_static/sphinx_highlight.js?v=4825356b"></script>
+    <script src="../../../../_static/js/theme.js"></script>
+    <link rel="index" title="Index" href="../../../../genindex.html" />
+    <link rel="search" title="Search" href="../../../../search.html" /> 
+</head>
+
+<body class="wy-body-for-nav"> 
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
+      <div class="wy-side-scroll">
+        <div class="wy-side-nav-search" >
+
+          
+          
+          <a href="../../../../index.html" class="icon icon-home">
+            data_juicer
+          </a>
+              <div class="version">
+                0.2.0
+              </div>
+<div role="search">
+  <form id="rtd-search-form" class="wy-form" action="../../../../search.html" method="get">
+    <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
+    <input type="hidden" name="check_keywords" value="yes" />
+    <input type="hidden" name="area" value="default" />
+  </form>
+</div>
+        </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <p class="caption" role="heading"><span class="caption-text">API Reference</span></p>
+<ul>
+<li class="toctree-l1"><a class="reference internal" href="../../../../data_juicer.core.html">data_juicer.core</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../data_juicer.ops.html">data_juicer.ops</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../data_juicer.ops.filter.html">data_juicer.ops.filter</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../data_juicer.ops.mapper.html">data_juicer.ops.mapper</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../data_juicer.ops.deduplicator.html">data_juicer.ops.deduplicator</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../data_juicer.ops.selector.html">data_juicer.ops.selector</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../data_juicer.ops.common.html">data_juicer.ops.common</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../data_juicer.analysis.html">data_juicer.analysis</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../data_juicer.config.html">data_juicer.config</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../data_juicer.format.html">data_juicer.format</a></li>
+</ul>
+
+        </div>
+      </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../../../../index.html">data_juicer</a>
+      </nav>
+
+      <div class="wy-nav-content">
+        <div class="rst-content">
+          <div role="navigation" aria-label="Page navigation">
+  <ul class="wy-breadcrumbs">
+      <li><a href="../../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
+          <li class="breadcrumb-item"><a href="../../../index.html">Module code</a></li>
+          <li class="breadcrumb-item"><a href="../../../data_juicer.html">data_juicer</a></li>
+      <li class="breadcrumb-item active">data_juicer.ops.selector.range_specified_field_selector</li>
+      <li class="wy-breadcrumbs-aside">
+      </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+           <div itemprop="articleBody">
+             
+  <h1>Source code for data_juicer.ops.selector.range_specified_field_selector</h1><div class="highlight"><pre>
+<span></span><span class="kn">import</span> <span class="nn">heapq</span>
+
+<span class="kn">from</span> <span class="nn">jsonargparse.typing</span> <span class="kn">import</span> <span class="n">ClosedUnitInterval</span><span class="p">,</span> <span class="n">PositiveInt</span>
+
+<span class="kn">from</span> <span class="nn">data_juicer.utils.common_utils</span> <span class="kn">import</span> <span class="n">stats_to_number</span>
+
+<span class="kn">from</span> <span class="nn">..base_op</span> <span class="kn">import</span> <span class="n">OPERATORS</span><span class="p">,</span> <span class="n">Selector</span>
+
+
+<div class="viewcode-block" id="RangeSpecifiedFieldSelector"><a class="viewcode-back" href="../../../../data_juicer.ops.selector.html#data_juicer.ops.selector.RangeSpecifiedFieldSelector">[docs]</a><span class="nd">@OPERATORS</span><span class="o">.</span><span class="n">register_module</span><span class="p">(</span><span class="s1">&#39;range_specified_field_selector&#39;</span><span class="p">)</span>
+<span class="k">class</span> <span class="nc">RangeSpecifiedFieldSelector</span><span class="p">(</span><span class="n">Selector</span><span class="p">):</span>
+<span class="w">    </span><span class="sd">&quot;&quot;&quot;Selector to select a range of samples based on the sorted</span>
+<span class="sd">    specified field value from smallest to largest. &quot;&quot;&quot;</span>
+
+<div class="viewcode-block" id="RangeSpecifiedFieldSelector.__init__"><a class="viewcode-back" href="../../../../data_juicer.ops.selector.html#data_juicer.ops.selector.RangeSpecifiedFieldSelector.__init__">[docs]</a>    <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span>
+                 <span class="n">field_key</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s1">&#39;&#39;</span><span class="p">,</span>
+                 <span class="n">lower_percentile</span><span class="p">:</span> <span class="n">ClosedUnitInterval</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
+                 <span class="n">upper_percentile</span><span class="p">:</span> <span class="n">ClosedUnitInterval</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
+                 <span class="n">lower_rank</span><span class="p">:</span> <span class="n">PositiveInt</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
+                 <span class="n">upper_rank</span><span class="p">:</span> <span class="n">PositiveInt</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
+                 <span class="o">*</span><span class="n">args</span><span class="p">,</span>
+                 <span class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
+<span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
+<span class="sd">        Initialization method.</span>
+
+<span class="sd">        :param field_key: Selector based on the specified value</span>
+<span class="sd">            corresponding to the target key. The target key</span>
+<span class="sd">            corresponding to multi-level field information need to be</span>
+<span class="sd">            separated by &#39;.&#39;.</span>
+<span class="sd">        :param lower_percentile: The lower bound of the percentile to</span>
+<span class="sd">            be sample, samples will be selected if their specified field</span>
+<span class="sd">            values are greater than this lower bound. When both</span>
+<span class="sd">            lower_percentile and lower_rank are set, the value corresponding</span>
+<span class="sd">            to the larger number of samples will be applied.</span>
+<span class="sd">        :param upper_percentile: The upper bound of the percentile to</span>
+<span class="sd">            be sample, samples will be selected if their specified field</span>
+<span class="sd">            values are less or equal to the upper bound. When both</span>
+<span class="sd">            upper_percentile and upper_rank are set, the value corresponding</span>
+<span class="sd">            to the smaller number of samples will be applied.</span>
+<span class="sd">        :param lower_rank: The lower bound of the rank to be sample,</span>
+<span class="sd">            samples will be selected if their specified field values are</span>
+<span class="sd">            greater than this lower bound. When both lower_percentile and</span>
+<span class="sd">            lower_rank are set, the value corresponding to the larger number</span>
+<span class="sd">            of samples will be applied.</span>
+<span class="sd">        :param upper_rank: The upper bound of the rank to be sample,</span>
+<span class="sd">            samples will be selected if their specified field values are</span>
+<span class="sd">            less or equal to the upper bound. When both upper_percentile and</span>
+<span class="sd">            upper_rank are set, the value corresponding to the smaller number</span>
+<span class="sd">            of samples will be applied.</span>
+<span class="sd">        :param args: extra args</span>
+<span class="sd">        :param kwargs: extra args</span>
+<span class="sd">        &quot;&quot;&quot;</span>
+        <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
+        <span class="bp">self</span><span class="o">.</span><span class="n">field_key</span> <span class="o">=</span> <span class="n">field_key</span>
+        <span class="bp">self</span><span class="o">.</span><span class="n">lower_percentile</span> <span class="o">=</span> <span class="n">lower_percentile</span>
+        <span class="bp">self</span><span class="o">.</span><span class="n">upper_percentile</span> <span class="o">=</span> <span class="n">upper_percentile</span>
+        <span class="bp">self</span><span class="o">.</span><span class="n">lower_rank</span> <span class="o">=</span> <span class="n">lower_rank</span>
+        <span class="bp">self</span><span class="o">.</span><span class="n">upper_rank</span> <span class="o">=</span> <span class="n">upper_rank</span></div>
+
+<div class="viewcode-block" id="RangeSpecifiedFieldSelector.process"><a class="viewcode-back" href="../../../../data_juicer.ops.selector.html#data_juicer.ops.selector.RangeSpecifiedFieldSelector.process">[docs]</a>    <span class="k">def</span> <span class="nf">process</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">dataset</span><span class="p">):</span>
+        <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">dataset</span><span class="p">)</span> <span class="o">&lt;=</span> <span class="mi">1</span> <span class="ow">or</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">field_key</span><span class="p">:</span>
+            <span class="k">return</span> <span class="n">dataset</span>
+
+        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">lower_percentile</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">lower_rank</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
+            <span class="k">return</span> <span class="n">dataset</span>
+        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">upper_percentile</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">upper_rank</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
+            <span class="k">return</span> <span class="n">dataset</span>
+
+        <span class="n">lower_bound</span><span class="p">,</span> <span class="n">upper_bound</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> <span class="nb">len</span><span class="p">(</span><span class="n">dataset</span><span class="p">)</span>
+        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">lower_percentile</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
+            <span class="n">lower_bound</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">lower_percentile</span> <span class="o">*</span> <span class="nb">len</span><span class="p">(</span><span class="n">dataset</span><span class="p">))</span>
+        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">lower_rank</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
+            <span class="n">lower_bound</span> <span class="o">=</span> <span class="nb">max</span><span class="p">(</span><span class="n">lower_bound</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">lower_rank</span><span class="p">)</span>
+        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">upper_percentile</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
+            <span class="n">upper_bound</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">upper_percentile</span> <span class="o">*</span> <span class="nb">len</span><span class="p">(</span><span class="n">dataset</span><span class="p">))</span>
+        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">upper_rank</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
+            <span class="n">upper_bound</span> <span class="o">=</span> <span class="nb">min</span><span class="p">(</span><span class="n">upper_bound</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">upper_rank</span><span class="p">)</span>
+        <span class="n">upper_bound</span> <span class="o">=</span> <span class="nb">max</span><span class="p">(</span><span class="n">lower_bound</span><span class="p">,</span> <span class="n">upper_bound</span><span class="p">)</span>
+
+        <span class="n">field_keys</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">field_key</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s1">&#39;.&#39;</span><span class="p">)</span>
+        <span class="k">assert</span> <span class="n">field_keys</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="ow">in</span> <span class="n">dataset</span><span class="o">.</span><span class="n">features</span><span class="o">.</span><span class="n">keys</span><span class="p">(</span>
+        <span class="p">),</span> <span class="s2">&quot;&#39;</span><span class="si">{}</span><span class="s2">&#39; not in </span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">field_keys</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">dataset</span><span class="o">.</span><span class="n">features</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span>
+
+        <span class="k">def</span> <span class="nf">get_field_value_list</span><span class="p">(</span><span class="n">cur_dataset</span><span class="p">,</span> <span class="n">field_keys</span><span class="p">):</span>
+            <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">field_keys</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
+                <span class="n">field_value_list</span> <span class="o">=</span> <span class="n">cur_dataset</span><span class="p">[</span><span class="n">field_keys</span><span class="p">[</span><span class="mi">0</span><span class="p">]]</span>
+            <span class="k">else</span><span class="p">:</span>
+                <span class="n">field_value_list</span> <span class="o">=</span> <span class="p">[]</span>
+                <span class="k">for</span> <span class="n">item</span> <span class="ow">in</span> <span class="n">cur_dataset</span><span class="p">[</span><span class="n">field_keys</span><span class="p">[</span><span class="mi">0</span><span class="p">]]:</span>
+                    <span class="n">field_value</span> <span class="o">=</span> <span class="n">item</span>
+                    <span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">field_keys</span><span class="p">[</span><span class="mi">1</span><span class="p">:]:</span>
+                        <span class="k">assert</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">field_value</span><span class="o">.</span><span class="n">keys</span><span class="p">(</span>
+                        <span class="p">),</span> <span class="s2">&quot;&#39;</span><span class="si">{}</span><span class="s2">&#39; not in </span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="n">field_value</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span>
+                        <span class="n">field_value</span> <span class="o">=</span> <span class="n">field_value</span><span class="p">[</span><span class="n">key</span><span class="p">]</span>
+                    <span class="n">field_value_list</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">field_value</span><span class="p">)</span>
+            <span class="n">field_value_list</span> <span class="o">=</span> <span class="p">[</span><span class="n">stats_to_number</span><span class="p">(</span><span class="n">s</span><span class="p">)</span> <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="n">field_value_list</span><span class="p">]</span>
+            <span class="k">return</span> <span class="n">field_value_list</span>
+
+        <span class="n">field_value_list</span> <span class="o">=</span> <span class="n">get_field_value_list</span><span class="p">(</span><span class="n">dataset</span><span class="p">,</span> <span class="n">field_keys</span><span class="p">)</span>
+        <span class="n">select_index</span> <span class="o">=</span> <span class="n">heapq</span><span class="o">.</span><span class="n">nsmallest</span><span class="p">(</span><span class="nb">int</span><span class="p">(</span><span class="n">upper_bound</span><span class="p">),</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">dataset</span><span class="p">)),</span>
+                                       <span class="n">field_value_list</span><span class="o">.</span><span class="fm">__getitem__</span><span class="p">)</span>
+        <span class="n">sub_dataset</span> <span class="o">=</span> <span class="n">dataset</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">select_index</span><span class="p">)</span>
+
+        <span class="n">field_value_list</span> <span class="o">=</span> <span class="n">get_field_value_list</span><span class="p">(</span><span class="n">sub_dataset</span><span class="p">,</span> <span class="n">field_keys</span><span class="p">)</span>
+        <span class="n">select_index</span> <span class="o">=</span> <span class="n">heapq</span><span class="o">.</span><span class="n">nlargest</span><span class="p">(</span><span class="nb">int</span><span class="p">(</span><span class="n">upper_bound</span> <span class="o">-</span> <span class="n">lower_bound</span><span class="p">),</span>
+                                      <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">sub_dataset</span><span class="p">)),</span>
+                                      <span class="n">field_value_list</span><span class="o">.</span><span class="fm">__getitem__</span><span class="p">)</span>
+
+        <span class="k">return</span> <span class="n">sub_dataset</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">select_index</span><span class="p">)</span></div></div>
+</pre></div>
+
+           </div>
+          </div>
+          <footer>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <p>&#169; Copyright 2024, Data-Juicer Team.</p>
+  </div>
+
+  Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
+    <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
+    provided by <a href="https://readthedocs.org">Read the Docs</a>.
+   
+
+</footer>
+        </div>
+      </div>
+    </section>
+  </div>
+  <script>
+      jQuery(function () {
+          SphinxRtdTheme.Navigation.enable(true);
+      });
+  </script> 
+
+</body>
+</html>
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/selector/topk_specified_field_selector.html b/_modules/data_juicer/ops/selector/topk_specified_field_selector.html
index df7008c7a..48cbad3a0 100644
--- a/_modules/data_juicer/ops/selector/topk_specified_field_selector.html
+++ b/_modules/data_juicer/ops/selector/topk_specified_field_selector.html
@@ -83,21 +83,12 @@
              
   <h1>Source code for data_juicer.ops.selector.topk_specified_field_selector</h1><div class="highlight"><pre>
 <span></span><span class="kn">import</span> <span class="nn">heapq</span>
-<span class="kn">import</span> <span class="nn">sys</span>
 
 <span class="kn">from</span> <span class="nn">jsonargparse.typing</span> <span class="kn">import</span> <span class="n">ClosedUnitInterval</span><span class="p">,</span> <span class="n">PositiveInt</span>
 
-<span class="kn">from</span> <span class="nn">..base_op</span> <span class="kn">import</span> <span class="n">OPERATORS</span><span class="p">,</span> <span class="n">Selector</span>
-
+<span class="kn">from</span> <span class="nn">data_juicer.utils.common_utils</span> <span class="kn">import</span> <span class="n">stats_to_number</span>
 
-<span class="k">def</span> <span class="nf">to_number</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="n">reverse</span><span class="o">=</span><span class="kc">True</span><span class="p">):</span>
-    <span class="k">try</span><span class="p">:</span>
-        <span class="k">return</span> <span class="nb">float</span><span class="p">(</span><span class="n">s</span><span class="p">)</span>
-    <span class="k">except</span> <span class="ne">Exception</span><span class="p">:</span>
-        <span class="k">if</span> <span class="n">reverse</span><span class="p">:</span>
-            <span class="k">return</span> <span class="o">-</span><span class="n">sys</span><span class="o">.</span><span class="n">maxsize</span>
-        <span class="k">else</span><span class="p">:</span>
-            <span class="k">return</span> <span class="n">sys</span><span class="o">.</span><span class="n">maxsize</span>
+<span class="kn">from</span> <span class="nn">..base_op</span> <span class="kn">import</span> <span class="n">OPERATORS</span><span class="p">,</span> <span class="n">Selector</span>
 
 
 <div class="viewcode-block" id="TopkSpecifiedFieldSelector"><a class="viewcode-back" href="../../../../data_juicer.ops.selector.html#data_juicer.ops.selector.TopkSpecifiedFieldSelector">[docs]</a><span class="nd">@OPERATORS</span><span class="o">.</span><span class="n">register_module</span><span class="p">(</span><span class="s1">&#39;topk_specified_field_selector&#39;</span><span class="p">)</span>
@@ -169,7 +160,8 @@ <h1>Source code for data_juicer.ops.selector.topk_specified_field_selector</h1><
                     <span class="k">assert</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">field_value</span><span class="o">.</span><span class="n">keys</span><span class="p">(),</span> <span class="s2">&quot;&#39;</span><span class="si">{}</span><span class="s2">&#39; not in </span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
                         <span class="n">key</span><span class="p">,</span> <span class="n">field_value</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span>
                     <span class="n">field_value</span> <span class="o">=</span> <span class="n">field_value</span><span class="p">[</span><span class="n">key</span><span class="p">]</span>
-                <span class="n">field_value_list</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">to_number</span><span class="p">(</span><span class="n">field_value</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">reverse</span><span class="p">))</span>
+                <span class="n">field_value_list</span><span class="o">.</span><span class="n">append</span><span class="p">(</span>
+                    <span class="n">stats_to_number</span><span class="p">(</span><span class="n">field_value</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">reverse</span><span class="p">))</span>
 
         <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">reverse</span><span class="p">:</span>
             <span class="n">select_index</span> <span class="o">=</span> <span class="n">heapq</span><span class="o">.</span><span class="n">nlargest</span><span class="p">(</span><span class="nb">int</span><span class="p">(</span><span class="n">select_num</span><span class="p">),</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">dataset</span><span class="p">)),</span>
diff --git a/_modules/index.html b/_modules/index.html
index 751381c6e..3b8220535 100644
--- a/_modules/index.html
+++ b/_modules/index.html
@@ -196,6 +196,8 @@ <h1>All modules for which code is available</h1>
 <li><a href="data_juicer/ops/mapper/video_tagging_from_frames_mapper.html">data_juicer.ops.mapper.video_tagging_from_frames_mapper</a></li>
 <li><a href="data_juicer/ops/mapper/whitespace_normalization_mapper.html">data_juicer.ops.mapper.whitespace_normalization_mapper</a></li>
 <li><a href="data_juicer/ops/selector/frequency_specified_field_selector.html">data_juicer.ops.selector.frequency_specified_field_selector</a></li>
+<li><a href="data_juicer/ops/selector/random_selector.html">data_juicer.ops.selector.random_selector</a></li>
+<li><a href="data_juicer/ops/selector/range_specified_field_selector.html">data_juicer.ops.selector.range_specified_field_selector</a></li>
 <li><a href="data_juicer/ops/selector/topk_specified_field_selector.html">data_juicer.ops.selector.topk_specified_field_selector</a></li>
 </ul></ul>
 
diff --git a/data_juicer.ops.filter.html b/data_juicer.ops.filter.html
index 4335f54ca..18c1a115a 100644
--- a/data_juicer.ops.filter.html
+++ b/data_juicer.ops.filter.html
@@ -1702,12 +1702,12 @@
 
 <dl class="py class">
 <dt class="sig sig-object py" id="data_juicer.ops.filter.VideoDurationFilter">
-<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">data_juicer.ops.filter.</span></span><span class="sig-name descname"><span class="pre">VideoDurationFilter</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">min_duration</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">NonNegativeInt</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">0</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_duration</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">NonNegativeInt</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">9223372036854775807</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">any_or_all</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'any'</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/data_juicer/ops/filter/video_duration_filter.html#VideoDurationFilter"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#data_juicer.ops.filter.VideoDurationFilter" title="Permalink to this definition">¶</a></dt>
+<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">data_juicer.ops.filter.</span></span><span class="sig-name descname"><span class="pre">VideoDurationFilter</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">min_duration</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">NonNegativeFloat</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">0</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_duration</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">NonNegativeFloat</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">9223372036854775807</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">any_or_all</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'any'</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/data_juicer/ops/filter/video_duration_filter.html#VideoDurationFilter"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#data_juicer.ops.filter.VideoDurationFilter" title="Permalink to this definition">¶</a></dt>
 <dd><p>Bases: <a class="reference internal" href="data_juicer.ops.html#data_juicer.ops.Filter" title="data_juicer.ops.base_op.Filter"><code class="xref py py-class docutils literal notranslate"><span class="pre">Filter</span></code></a></p>
 <p>Keep data samples whose videos’ durations are within a specified range.</p>
 <dl class="py method">
 <dt class="sig sig-object py" id="data_juicer.ops.filter.VideoDurationFilter.__init__">
-<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">min_duration</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">NonNegativeInt</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">0</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_duration</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">NonNegativeInt</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">9223372036854775807</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">any_or_all</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'any'</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/data_juicer/ops/filter/video_duration_filter.html#VideoDurationFilter.__init__"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#data_juicer.ops.filter.VideoDurationFilter.__init__" title="Permalink to this definition">¶</a></dt>
+<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">min_duration</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">NonNegativeFloat</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">0</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_duration</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">NonNegativeFloat</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">9223372036854775807</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">any_or_all</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'any'</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/data_juicer/ops/filter/video_duration_filter.html#VideoDurationFilter.__init__"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#data_juicer.ops.filter.VideoDurationFilter.__init__" title="Permalink to this definition">¶</a></dt>
 <dd><p>Initialization method.</p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters<span class="colon">:</span></dt>
diff --git a/data_juicer.ops.selector.html b/data_juicer.ops.selector.html
index a9b425b92..5bfc7d485 100644
--- a/data_juicer.ops.selector.html
+++ b/data_juicer.ops.selector.html
@@ -54,6 +54,8 @@
 <li class="toctree-l1"><a class="reference internal" href="data_juicer.ops.deduplicator.html">data_juicer.ops.deduplicator</a></li>
 <li class="toctree-l1 current"><a class="current reference internal" href="#">data_juicer.ops.selector</a><ul>
 <li class="toctree-l2"><a class="reference internal" href="#data_juicer.ops.selector.FrequencySpecifiedFieldSelector"><code class="docutils literal notranslate"><span class="pre">FrequencySpecifiedFieldSelector</span></code></a></li>
+<li class="toctree-l2"><a class="reference internal" href="#data_juicer.ops.selector.RandomSelector"><code class="docutils literal notranslate"><span class="pre">RandomSelector</span></code></a></li>
+<li class="toctree-l2"><a class="reference internal" href="#data_juicer.ops.selector.RangeSpecifiedFieldSelector"><code class="docutils literal notranslate"><span class="pre">RangeSpecifiedFieldSelector</span></code></a></li>
 <li class="toctree-l2"><a class="reference internal" href="#data_juicer.ops.selector.TopkSpecifiedFieldSelector"><code class="docutils literal notranslate"><span class="pre">TopkSpecifiedFieldSelector</span></code></a></li>
 </ul>
 </li>
@@ -141,6 +143,107 @@
 
 </dd></dl>
 
+<dl class="py class">
+<dt class="sig sig-object py" id="data_juicer.ops.selector.RandomSelector">
+<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">data_juicer.ops.selector.</span></span><span class="sig-name descname"><span class="pre">RandomSelector</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">select_ratio</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">ClosedUnitInterval</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">select_num</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">PositiveInt</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/data_juicer/ops/selector/random_selector.html#RandomSelector"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#data_juicer.ops.selector.RandomSelector" title="Permalink to this definition">¶</a></dt>
+<dd><p>Bases: <a class="reference internal" href="data_juicer.ops.html#data_juicer.ops.Selector" title="data_juicer.ops.base_op.Selector"><code class="xref py py-class docutils literal notranslate"><span class="pre">Selector</span></code></a></p>
+<p>Selector to random select samples.</p>
+<dl class="py method">
+<dt class="sig sig-object py" id="data_juicer.ops.selector.RandomSelector.__init__">
+<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">select_ratio</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">ClosedUnitInterval</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">select_num</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">PositiveInt</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/data_juicer/ops/selector/random_selector.html#RandomSelector.__init__"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#data_juicer.ops.selector.RandomSelector.__init__" title="Permalink to this definition">¶</a></dt>
+<dd><p>Initialization method.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>select_ratio</strong> – The ratio to select. When both
+select_ratio and select_num are set, the value corresponding
+to the smaller number of samples will be applied.</p></li>
+<li><p><strong>select_num</strong> – The number of samples to select. When both
+select_ratio and select_num are set, the value corresponding
+to the smaller number of samples will be applied.</p></li>
+<li><p><strong>args</strong> – extra args</p></li>
+<li><p><strong>kwargs</strong> – extra args</p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py method">
+<dt class="sig sig-object py" id="data_juicer.ops.selector.RandomSelector.process">
+<span class="sig-name descname"><span class="pre">process</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/data_juicer/ops/selector/random_selector.html#RandomSelector.process"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#data_juicer.ops.selector.RandomSelector.process" title="Permalink to this definition">¶</a></dt>
+<dd><p>Dataset –&gt; dataset.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><p><strong>dataset</strong> – input dataset</p>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>selected dataset.</p>
+</dd>
+</dl>
+</dd></dl>
+
+</dd></dl>
+
+<dl class="py class">
+<dt class="sig sig-object py" id="data_juicer.ops.selector.RangeSpecifiedFieldSelector">
+<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">data_juicer.ops.selector.</span></span><span class="sig-name descname"><span class="pre">RangeSpecifiedFieldSelector</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">field_key</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">''</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">lower_percentile</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">ClosedUnitInterval</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">upper_percentile</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">ClosedUnitInterval</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">lower_rank</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">PositiveInt</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">upper_rank</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">PositiveInt</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/data_juicer/ops/selector/range_specified_field_selector.html#RangeSpecifiedFieldSelector"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#data_juicer.ops.selector.RangeSpecifiedFieldSelector" title="Permalink to this definition">¶</a></dt>
+<dd><p>Bases: <a class="reference internal" href="data_juicer.ops.html#data_juicer.ops.Selector" title="data_juicer.ops.base_op.Selector"><code class="xref py py-class docutils literal notranslate"><span class="pre">Selector</span></code></a></p>
+<p>Selector to select a range of samples based on the sorted
+specified field value from smallest to largest.</p>
+<dl class="py method">
+<dt class="sig sig-object py" id="data_juicer.ops.selector.RangeSpecifiedFieldSelector.__init__">
+<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">field_key</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">''</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">lower_percentile</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">ClosedUnitInterval</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">upper_percentile</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">ClosedUnitInterval</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">lower_rank</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">PositiveInt</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">upper_rank</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">PositiveInt</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/data_juicer/ops/selector/range_specified_field_selector.html#RangeSpecifiedFieldSelector.__init__"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#data_juicer.ops.selector.RangeSpecifiedFieldSelector.__init__" title="Permalink to this definition">¶</a></dt>
+<dd><p>Initialization method.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>field_key</strong> – Selector based on the specified value
+corresponding to the target key. The target key
+corresponding to multi-level field information need to be
+separated by ‘.’.</p></li>
+<li><p><strong>lower_percentile</strong> – The lower bound of the percentile to
+be sample, samples will be selected if their specified field
+values are greater than this lower bound. When both
+lower_percentile and lower_rank are set, the value corresponding
+to the larger number of samples will be applied.</p></li>
+<li><p><strong>upper_percentile</strong> – The upper bound of the percentile to
+be sample, samples will be selected if their specified field
+values are less or equal to the upper bound. When both
+upper_percentile and upper_rank are set, the value corresponding
+to the smaller number of samples will be applied.</p></li>
+<li><p><strong>lower_rank</strong> – The lower bound of the rank to be sample,
+samples will be selected if their specified field values are
+greater than this lower bound. When both lower_percentile and
+lower_rank are set, the value corresponding to the larger number
+of samples will be applied.</p></li>
+<li><p><strong>upper_rank</strong> – The upper bound of the rank to be sample,
+samples will be selected if their specified field values are
+less or equal to the upper bound. When both upper_percentile and
+upper_rank are set, the value corresponding to the smaller number
+of samples will be applied.</p></li>
+<li><p><strong>args</strong> – extra args</p></li>
+<li><p><strong>kwargs</strong> – extra args</p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py method">
+<dt class="sig sig-object py" id="data_juicer.ops.selector.RangeSpecifiedFieldSelector.process">
+<span class="sig-name descname"><span class="pre">process</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/data_juicer/ops/selector/range_specified_field_selector.html#RangeSpecifiedFieldSelector.process"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#data_juicer.ops.selector.RangeSpecifiedFieldSelector.process" title="Permalink to this definition">¶</a></dt>
+<dd><p>Dataset –&gt; dataset.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><p><strong>dataset</strong> – input dataset</p>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>selected dataset.</p>
+</dd>
+</dl>
+</dd></dl>
+
+</dd></dl>
+
 <dl class="py class">
 <dt class="sig sig-object py" id="data_juicer.ops.selector.TopkSpecifiedFieldSelector">
 <em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">data_juicer.ops.selector.</span></span><span class="sig-name descname"><span class="pre">TopkSpecifiedFieldSelector</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">field_key</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">''</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">top_ratio</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">ClosedUnitInterval</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">topk</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">PositiveInt</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">reverse</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/data_juicer/ops/selector/topk_specified_field_selector.html#TopkSpecifiedFieldSelector"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#data_juicer.ops.selector.TopkSpecifiedFieldSelector" title="Permalink to this definition">¶</a></dt>
diff --git a/genindex.html b/genindex.html
index 365bcb868..7dc29b637 100644
--- a/genindex.html
+++ b/genindex.html
@@ -338,6 +338,10 @@ <h2 id="_">_</h2>
         <li><a href="data_juicer.ops.html#data_juicer.ops.Selector.__init__">(data_juicer.ops.Selector method)</a>
 </li>
         <li><a href="data_juicer.ops.selector.html#data_juicer.ops.selector.FrequencySpecifiedFieldSelector.__init__">(data_juicer.ops.selector.FrequencySpecifiedFieldSelector method)</a>
+</li>
+        <li><a href="data_juicer.ops.selector.html#data_juicer.ops.selector.RandomSelector.__init__">(data_juicer.ops.selector.RandomSelector method)</a>
+</li>
+        <li><a href="data_juicer.ops.selector.html#data_juicer.ops.selector.RangeSpecifiedFieldSelector.__init__">(data_juicer.ops.selector.RangeSpecifiedFieldSelector method)</a>
 </li>
         <li><a href="data_juicer.ops.selector.html#data_juicer.ops.selector.TopkSpecifiedFieldSelector.__init__">(data_juicer.ops.selector.TopkSpecifiedFieldSelector method)</a>
 </li>
@@ -1058,6 +1062,10 @@ <h2 id="P">P</h2>
         <li><a href="data_juicer.ops.html#data_juicer.ops.Selector.process">(data_juicer.ops.Selector method)</a>
 </li>
         <li><a href="data_juicer.ops.selector.html#data_juicer.ops.selector.FrequencySpecifiedFieldSelector.process">(data_juicer.ops.selector.FrequencySpecifiedFieldSelector method)</a>
+</li>
+        <li><a href="data_juicer.ops.selector.html#data_juicer.ops.selector.RandomSelector.process">(data_juicer.ops.selector.RandomSelector method)</a>
+</li>
+        <li><a href="data_juicer.ops.selector.html#data_juicer.ops.selector.RangeSpecifiedFieldSelector.process">(data_juicer.ops.selector.RangeSpecifiedFieldSelector method)</a>
 </li>
         <li><a href="data_juicer.ops.selector.html#data_juicer.ops.selector.TopkSpecifiedFieldSelector.process">(data_juicer.ops.selector.TopkSpecifiedFieldSelector method)</a>
 </li>
@@ -1073,6 +1081,10 @@ <h2 id="R">R</h2>
 <table style="width: 100%" class="indextable genindextable"><tr>
   <td style="width: 33%; vertical-align: top;"><ul>
       <li><a href="data_juicer.format.html#data_juicer.format.MixtureFormatter.random_sample">random_sample() (data_juicer.format.MixtureFormatter class method)</a>
+</li>
+      <li><a href="data_juicer.ops.selector.html#data_juicer.ops.selector.RandomSelector">RandomSelector (class in data_juicer.ops.selector)</a>
+</li>
+      <li><a href="data_juicer.ops.selector.html#data_juicer.ops.selector.RangeSpecifiedFieldSelector">RangeSpecifiedFieldSelector (class in data_juicer.ops.selector)</a>
 </li>
       <li><a href="data_juicer.ops.deduplicator.html#data_juicer.ops.deduplicator.RayBasicDeduplicator">RayBasicDeduplicator (class in data_juicer.ops.deduplicator)</a>
 </li>
@@ -1093,11 +1105,11 @@ <h2 id="R">R</h2>
       <li><a href="data_juicer.ops.mapper.html#data_juicer.ops.mapper.RemoveCommentsMapper">RemoveCommentsMapper (class in data_juicer.ops.mapper)</a>
 </li>
       <li><a href="data_juicer.ops.mapper.html#data_juicer.ops.mapper.RemoveHeaderMapper">RemoveHeaderMapper (class in data_juicer.ops.mapper)</a>
-</li>
-      <li><a href="data_juicer.ops.mapper.html#data_juicer.ops.mapper.RemoveLongWordsMapper">RemoveLongWordsMapper (class in data_juicer.ops.mapper)</a>
 </li>
   </ul></td>
   <td style="width: 33%; vertical-align: top;"><ul>
+      <li><a href="data_juicer.ops.mapper.html#data_juicer.ops.mapper.RemoveLongWordsMapper">RemoveLongWordsMapper (class in data_juicer.ops.mapper)</a>
+</li>
       <li><a href="data_juicer.ops.mapper.html#data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper">RemoveNonChineseCharacterlMapper (class in data_juicer.ops.mapper)</a>
 </li>
       <li><a href="data_juicer.ops.mapper.html#data_juicer.ops.mapper.RemoveRepeatSentencesMapper">RemoveRepeatSentencesMapper (class in data_juicer.ops.mapper)</a>
diff --git a/index.html b/index.html
index a470794c8..c3b18a91c 100644
--- a/index.html
+++ b/index.html
@@ -211,6 +211,8 @@ <h2>Tutorial<a class="headerlink" href="#tutorial" title="Permalink to this head
 </li>
 <li class="toctree-l1"><a class="reference internal" href="data_juicer.ops.selector.html">data_juicer.ops.selector</a><ul>
 <li class="toctree-l2"><a class="reference internal" href="data_juicer.ops.selector.html#data_juicer.ops.selector.FrequencySpecifiedFieldSelector"><code class="docutils literal notranslate"><span class="pre">FrequencySpecifiedFieldSelector</span></code></a></li>
+<li class="toctree-l2"><a class="reference internal" href="data_juicer.ops.selector.html#data_juicer.ops.selector.RandomSelector"><code class="docutils literal notranslate"><span class="pre">RandomSelector</span></code></a></li>
+<li class="toctree-l2"><a class="reference internal" href="data_juicer.ops.selector.html#data_juicer.ops.selector.RangeSpecifiedFieldSelector"><code class="docutils literal notranslate"><span class="pre">RangeSpecifiedFieldSelector</span></code></a></li>
 <li class="toctree-l2"><a class="reference internal" href="data_juicer.ops.selector.html#data_juicer.ops.selector.TopkSpecifiedFieldSelector"><code class="docutils literal notranslate"><span class="pre">TopkSpecifiedFieldSelector</span></code></a></li>
 </ul>
 </li>
diff --git a/objects.inv b/objects.inv
index 080f7e111ddfb3f6fcf51900c910e1f78b2e1686..4650694583a6c9831f288a9bdab077f4ea654a91 100644
GIT binary patch
delta 4903
zcmV+?6WHvbDc32Gx_?t8^Ks7Mxg?$!$dtB}{Hx0nR$o-FSI08#a(3}=7wOya%R_t2
zf97X4f4<oKB%Y%M&0m|WG2VQ-zzgN_eVKR1;%nBhYXLaPY13T%y9IXv;3Xkhk+p=3
zl39BwQyBDk?mLTsoYwT2JeDu9pbuHohGKuno><*v?JFpD3V$9fbSg3^aw>3A9;>bm
z2L5}GOY<D$tb_*hRmKImD#O-nmlcdOS+UO<Nyb+?l!K#?=l;teUd7SIM&nz!8N{Tg
z*NFIhT+5ZHU<M&2cb>T4!{s7msJVosEOy!6Sgr8W)=Fi%guPT{-I6!FT~{P+Ru<BG
zA|e`~#*c*m-G7839a+6+p_*!Dv8QGZbNUOe`R7@iN|QGf>+SC1!xyOTinSOwFD&WW
za_zI6ZV!HzGZ5feE*pBR`VwxqU_I9G`m^=E@Y^qc$=(eRv~BC`aj4MN!Tb#abBBR>
zz`%sRGZPRmt3WVhYI=~SrEL=o+Lq*BO<ACDas*IC>wiBww%V}ItZ7;Lp+DSkkQE=u
z3?ukWn4{4&b+v95;UzhvMOTr8CI?1#S<V^^a1S*VF#xQ(Jd!j^Fj2ic(h37q)n&pO
zkYP1D_D!oPxFF1i<qYK-#|qW=P}5{}&BJdF{-P$BcK9E8q-}B_{VP$ee_LmODNDO5
z&k{PdJb!`#!@i^$RM=zEQ=3d}ByLPV<_4yQU^rkZf&Cfg02&&0YD=jNM1Pj^y;_2)
zK2lp+O9-ZWH3TgWs{AqegHLx#&|OOV*Y~OaNB^03MQiDXXy1}uSs!U@?1Avp);`E|
z$@}K%cKplIN)^#OU@o`6-QIru&zr5Sb+;7LS$`6F0RBhc@<BZ83<veJb5O+Nekv21
zBc68-6hYf%Uea{zXf>AhqzCL>_R@AWLp-z$1$k@*G}eP(*0f0brX26Xh9H08t$c)~
zG8F35QE(BVy6+o_E#ys-kYcEVy+;r11rq8R4l2%bP=t+hM3{+~I`tt!o%(>TQ<LV2
z?0?RP`qsq8jOuEUXgS~bOjX=6MhV5iX)G9)k;*Vy)?uthu-DLlSw=hu(bA3|!cHVI
zv`sx;kRz{|gr_i9(#sx1!O`+hqf`RFxkg*Yui$9;+rEeq$PESTGUhpmo_6+KJ^;SU
z2lIECHDv73&?lPZ^pQ6>mUL&z%ACW{pMSG<WQOw}q>%t!iF?Lz0l3RWf2h+a@PRf5
z0Gw+W%pb`%L1IO@J&%zrHkscVXzv8#H@xnH6y)iIvD(eM^Dxn3H~FoBZcZRU9!wak
zJ=l)Johv`w)6IcCoMD2TIHRog;<0aglCm-sXd45PwKz1B$e*$={B)lgSbnR0fq$p_
z%)sia;tPGgQw8sflH)YBW~zVqCC+MSx@_M!HyT#7tmt1Q!lzWqq&qU}IG@bpAb2k8
zGzvJ+j_BK+&xfpKO*M>WxAc(|>^aX0h9PPC*Rvf+Ce)GPjSx%%2k!e{6$Xljk#_qb
zfONzH;__TAz`LPLwV1vX@j~bKGJm`H!}J#Ww&hf&RTO>|2a3$QJ%}Rosm}tq?y_Qd
zlphc?4O8^c={SEx1a-WT1G&wPKPZqIrs#pxasG%v>Ubjs^5c>2S;X)u_|bx<GQ|v)
z$`&a=_w@B0ZL%a{jC2&2H+Nqi{z-nl-u_B{zy9>wjco)O$s6nS(2^xIPk*|cMtVo|
zl-?r{D8_g%<_~Cl8CYVzCi)gw>^CKLL9C;Q>7u4Mj&0%ZS<1?|7Ah1mEmX<kS{U2E
zA7`G%xrl86l`T?{dm6DJ3wppCALivfY+t2E-X|f;yMZ|2IvQX74R4B*<-H%AejSZN
ze>Rl9#J8m%E>#702(0NVd4C+5Dr7-xH(aL{-$SbY%X2n?AcEI?e}iVtJ78~TK-8{!
z6P|_=lq{&vAyQR*hg6Nss2Y6)3d!fD!Bu{Vb69n{>^iRwwCLZsI!oNMwj%Z-P`|o=
zJ-mNEYdwcgHSdewrCIr*tH-f?ob4(q)^`m)-gpfO+WF6Sx7*M6SbyI|OuXwe^f;c+
zDA0~?v+oS+_ZuSKZ2>!u*8&03>0!D4o_)#+_K6ky_JDI-kBoKRz=4l|fe09h_Xk?j
zr2p%lRjkdRrMwXv8VxfGJRW8$U_@?ny5F<(YgwnTNGO=l2&m|A=T!=z<73z0nl<gA
z4{FlDT-V}4o!2wK-GA59z$4I9ENSnD<`g&tGH`?#C=`)`pzxrha7z;wAqEmGJ_0g9
z9RFn&wELg8w_mZwzaip17qH{lEfAoc-X3VhuvQC@c&|n9IA)6saJP}YDhV-y*AaVF
zG6CA@VKL3!ur)<i?6Hok$av>99C0F`A&L=+ZFbCZT4(JmJ`ff%cARJ^Eb-!@P@yC8
zmG+am2OUH%A~5^4xKRJ~4DbNxX@C*9qc7R9J4P6BGay5wVB&zs!9)a%#1~dqIeS4?
z2{I(qdj%YBy25~T`%u${{Zf}*k@i)S8weT%m@01ElQ{?^88n9<B?u~doLFdh;)O&*
zhKx*GR>O<~uw;{V2p>Rw>+1wMwJSqHT~@&1PAd#xw{MEJZ_R#S6)RF!B(G54_2BRb
z7-*uz!oUTN3b^AdL&75vx2vl#Al)v%v7)b&)d)m?Y?p%p9}x>zoak5x(UH0<ht~i*
z1)~SX$3>-x6AqOJ9R+BPkA#T#J!X!NOn`R!?@4`KGa8|4R*~_}YdGRWKtnWa$<`lA
zEx-8zU_r<Go;79O;adSRE<V-@L!3w}H0TIC(EWbMsHw{%uh+1FFu>#EVIqnX784sf
zI{5s57ac4<8qxD#v@AgPhhg&w3v6_-Q1`V2aQC$=VD}s3Xw)1K?syY%G)lq;x=e+2
z$i!pK_QuZSzY5%}Xjs5V(y8}Ld{#8C6s3W5s49*-(iAmuFZK<PGy`-9YO?Q)xE8_&
zXc6GdO89z5GH#)pk7Luscg5&RX$DxM#N?EJhtW5WPbC)dUrV`8r5V_$bH^9Vo%U6B
zE-hRqOa(Lw;=6v(^I$CTTYR@pmmbumZ5*Q{<5x^P7p8^CQ&1iHcEi@ysygF>tpIH@
zrbAiL?4F5h8B>590nVYM8P>26=Q^JNEh3&vj8LQ+SHXlx@hDQclJQqtj?;1n;=5db
zBtV)5)<<G<9swo(D=7kHD!>k*&!=tz@mg9Cpiag#3d^VLagwE^8AdLNU*7#XZF*3n
zxFb)g)>Z1xr1W(HO|V9Z)j0u~IFq#3NmB64TE^^KvOlHY_U()z|CTJtXI`n%0&SGr
z@bpx2ZWo;^c4vO?I+Z3^yTTz^uouRE@|1Wby#{ELF+GI6^_p}7ZF!-AIyKN{xoug^
zT9d}~$d#fbIJ1y3O^#So2UkiEv5fp<O+^#2<!{cgTPH-pvx{M4`V>je`BP*;F4}s9
z24uU|c@1Dr%UUnXugq_Uw4C9~BaWP<l|trdMj3qTKDArq$n9AtN)PJNrs)=cx$<5@
z<oS1trfm<U@f~Y6NPsZk@g1&eH;&_*TD9JJ69IZH@DA<AaSD9il62({1PE189s0!X
zrg26+K#)&&lPf8cn$Vh2&DUX@dk)a1fpuuC&vSmnZ>}^zodW96*7Auj{<~k(C$nfe
zF+i&c+aX!${@G{N0t9+@|Atn7nPxcBKIybhm=@D2tP$&2(@e~IF^#Q`9e?<D`Y~3^
zuZ0<BHjd{X{xj}j%P(>6VLM$mj)dRd9xJy0S`UX(_b+CO3@f1hW~V|)eCClnIo==o
zetW~-m&FsSTbMsOSg1Ez0;DfFqs9BOdabkl0qBbc7VC?S0PV}ok!Cr6))N5|?}rGE
z_Ttz6BCHi}h<GOiY_t&{E36GCJiG~KWV8jJvf^8VbwPl{8zF)Ny|{U)XwlbINnOIs
zP%xoxsOV5T+F|hYAJ;%LW{6-b=GZ_ZZnKx)iYzHp26IA&gj%711HBlhoZjVK4YNW*
zggPN(qm6h^tD(0rbr63&`SS4B-?6?J3Gg<Vsp7d7%dGewMY85Yw%c`0Pl0ttg2fvn
z6QF$=Qcdus&Kwo*ii8g|<#S%qZhwP&O_NUz7=Mgeb>pnV=dwQ1Jo`TW1LlgI0P2#1
z3h3HBJC;xEUG|t~<-VrX;T7hW2^#8@kp%A1`|>#afLjChXAT>|8;Jn!%dg|gH?S)r
zXarAY@Nh>ymBoG>ZU>tp;YRR9W{7RgWpgOIJSE>4t4Q9T8&j}xbf3%OG}3%Zfy%)_
zF@IA;h>wK~9;#vHiniOn=wL;U47N>+9>Fm^58SLNNh}${g0@P79l<Ca3*4p$`j}7Y
z&QOaa+z9r_3~+14Qt@ln9zGUHS=TISx7}mY)_o@%{X)5^A{gjF`v2hRtO6QS1D^ZK
z`-h*dC(Zc=jI1i}JHzkGp*TW*1S%{zgnt?WV7N}@^R~(}oGltus4F^rpebWs%X~O;
zOeui*V<&)G<)Dh|8qZjYX`9Fp+qW54T;sOg@ks0JJFt7_!4lg*g)gp&JZ&?ge*!^l
z>qIKFYjHBYOwf3zB4>JW4)plH1~q~+(Z>I$Jle6XcS-=odZPu$TcoGKJ2kAb6@L~D
zpuN(<Uv3}nuODuH`RBkx+rBmZ*tIMYSA@vuiQ>fOEhjuiXk_$QZYexyv&I(pyVuhy
z6RcNSaJ*G|njhlSWiuQlV~&199$(MPg3z>oW3OX#HA;Zy;L!pk(F~jYar?fpSH27*
zGGnAyVw<ocPGAJ&Xi*V4=o8JeVSjv%q^#|W>9TH+RgH!b9TH)AO@uDK9vvSV15ZS=
zV)X0{@KNoG)u;b@$XeD^efv93N`w_3>X@AgXc^z&YquLZbL5%#=7{%-q>Y_-=}IS~
z%7o7Xl7ugyAzsUY)``<%nuQIC>AkFJ1q<<7kF!#k4$~ow4NFOgb1_VSAb%0hA!fiL
zuEi(;QUo}MlB^>3wIBg1&R`A=`LLRSIM-tYC=v0DV%WHrpx2SqC2_BaSSLpXwCEWV
z`pF-ZxE3U=)iQ@ODjD_>>3BjtaV`m1r$xatis9Q!J?33aTq*hMq{wg%CCPc5l&{a;
zSM6V)$igJ~0HtPhf8l<~=@}4_e%jJgIyLc14i8W!V>*P5y9+8WpJ{ZHdJrlPY%Y?8
zxx(vYI?#=>>v0g1vk*Feb8q6jU+>~9$`HvGe3$xqOz$JjDZh^_Nns4ptcP~Vop!NH
zMyDmQ))c}JsXAbn&~_Y7n|4C=N#F|+D1*Ah@lVN^s6|%i)rCBS2an~s#`LP8H$UW8
z5wC2qwPH1xMq#J#Y8vtO$<|sYO9yFG#rGDHTikr$W?}BsI<*c3^o7Ewt&?#P6Mq>L
ziiOWH?d7&Ci3g@$M0W^RdLGuikN}0gJ&#G2cD&S`*ApUe$$05$cQ;;toOdN#kVX=q
zRFCY`I_`)*+fZ^*E=Q1HJ+_}@VGq3}2@b(}6=;WCUZ)AO-(*3GL4aC4vai_ltTS0K
z-yW`3LiSbL43nnBclJ44rU>iMIDgwtQEfc9rep)O8lacx)wCI$)1nx~eyxS)^K_kl
z0LF7-a-C#5xT?8f`It3e-MjE~k#qOL^M$uH`=cASdY<oez7^O^)30-~5Ej2lSRkLz
zRO|w}ds?J@^A79T929Pm1dn%Q&!WCk7KRvJ8M(m5J(ShAsJ@vA;I>()z<;joPTxC`
ze;Z<XLAP-!SzVS|9-tn-M&c|{){B!6?OOKCY^2wx9i-C;ZxP-0@0wY&&sVixvITvv
zxGl@PF;?K?S!>PR_Zuwtk7sT7Uo>rcm6nP8sKYH16D?@|+GLICmeYCLy;nxSoIbML
z3VC`0ErjmiOhHl>yKHaL&VSG9l=CxJ1}Lp+0ZgZPx?Mda@z%_&wq|;*HM`P+7<5iB
zdX@dH@_`xKZfK*nCXM{8P9;BcWq{Hu6~J^_r`y#-a@k-Kd)gmdQ<@C(g=Jy8kzZlI
z|H=ZZePRh%_tXN6{;;AYWog%UHxoJ(LzX8JuQNTDc+HiQ$oPy3m4E3;XOOm&hHdI?
z&nxUgZ`|ho$^vVAI&ph@!*fkA`opHZ>MhSAbTf^;+$#&L@mVx`;2U7{ht0iL%hSwi
zTT5MkWd%1rleTYcgfg9wRbA;y9is*dwS29n;b8FYt=*t&Zd~bcv=Y@8h<dYfMAJes
z2z_gy=-M?`dK|4#wSTFj-mDxEDX-*ak(S0lAYSv+o)uY<vX?o^=_r5NkHeooy<IkW
zBV6&7<$XEMF4}Sg-*E8%{qO(jpQB8=A-7~49{LkUx$6JhU(j6ijrxmm8eB5#4f^Nl
z(oKKqX1esFaQ8T`-Au4;*T3Q!&5dB^3m&FFrW>EN`u`%UH(26M_ix&ABiz4fHnaN|
z*wjZ&6PD_Ab2`C&zvB&Q9v~j++3PzU;uniypH7(HBk|6k1+g;}L61Y|m)E|RT+H=0
ZjMm9vej`7|@7eP6U+A9w^nce$doLJfhZz6>

delta 4815
zcmV;=5-{!8DWNHlx_{H{N|NUso(tl60j0F1<X>Huu==8Uy*id@m$Qq1yGY-DzdW?Z
z{AYe;^XH4rPvSLN(EPQ@8so#K8@y01-<NrJEWTz9yB2_xoHotHzgut@0A3Q36<JHj
zD4Df~GKE2p*S@m|$Z1WV$z%Bv3;K{XZ7BAK?1|M)*1m#br+?tVLZ>2wBBuf;<+1A8
zVBo*^xHQi}&Pr%7Uu9g7t1@iOc3HtllNI}%kz_2=p&T5AJojG)@v4qCHX7f`%^)T{
z6(i#FaV=M(f*FLA+<D?Ihs#CCP;&`MS?sdCv034lt&Pfb3wx=`x+QOTyRJyutSqE*
zA|e`~#*c*myMGBoI<k7tLN(RQVo%K+=JYpQ^Ut$3)h2H!*2mq|hc8gw6>BkWURct#
z<=S^SJs$inXCT14TsHJr^)=jZ!+NZt__OuC@W(HI$=(eRv~BC`acI!i!Tb#abBBR>
zz`%r;nF)y3RUjBLbv;Pa(zXc(ZA<d6rYul6IRdDn^?x57TW#29*0e1B(Er?UkQE=u
z3?sNC%+Y9?x_UQ@@RFR-qN_+klLI5WEN2Y{xQCjm7ywpX9!Z)dn5bSJX@vo*>M~&s
z$gr9n`>xd#To7i%a)xq^V}<H_sA;mg<>3zpe^C=mJNzGcq-}B_eUYfvm)03z%F?dN
zvxH7PkAGmmurFx_7519+)F)Fri5nA;xr3=87=AF-!2S$#01XX0^`+DfqCd;|SuMfT
z9;rR8B?Qy68iJMwRsNX#!KWuB=q{yw@qOz5qyNmiqO}Y|w54QM)<@bJM<D#Nbqq4y
z^1gYx9sgzNrHW`CFqhlkZf`&S=grpEyIYFstbd6-0RN-!`5<0)hJ$+BIVj?FKa~m1
z5$`((ilFT>FKIdsv>Ho)(gXG`duh9xAzoUBg1oi@8tcU`Yg(jzSB{TiLy*7lUOvK7
z84C63D7c7F-S?fu7E+QVq!{X8<>-OEKteskLB)9vim-8x2s07WranYyQy<W6YSKKB
z-G3QT-<#N&QC&R}E$17bsfk<0D4{qwjTOT(QW-|eI?UAw_8K}c%ZTS7TH5hL*oj1j
zzNtq6Ir5fCcnfnQz3fF494-GeOC|7|TeM~T3XYb)?W-7p+)%+TW1fTPX=mT%1K_)S
zFn^a>L&gyeeWF=TA9;sk$#ABu%sCAGHGk_sX1M-AnhDU=xMv&}fV*DwhdRvyA82y`
zz`3Tu{E=)6BvzE$^Bl=yllh~8_D&#v)9XG+L7q++tKGaiPZKS6lRp~h<^&St!Gy8e
zgY7uox$?t3Jsjx6879byGs<c&9{av0DJw&TwlN@Ct3xx1{3-jwPtTcw<x=enJbyiB
z23CuTFZB6I6}+!Xj?>bbsr})%IGdsAwte5-Xjsv*qAyB>FR7GCcVyOiKAFcs@Lbku
z6mXs$(f2!_4_V8aYM9M#=_4uFbDk9pL(=rcvmHn#)REzh5KICG?#Evh28xG~cKacK
zbi@MU@?0%I*-(~ROkaw4qw{;2-GBUHD#gBiIhAP@g<r#gBJ=JKq6mH3vjDEUtQcP9
z2gFRn6g_l0&L0s$9dG17ZnNVL3Z#Z9dLVV2KO&Gi-bjJ`c%*w4F?<Ssw4kX>F+-)Y
zMGDY8eSJrpEQuH+9mVC%-Is@dl3%a4zmngtKmB%Nn?XkM#ws3KvV`VImw(epWkgTu
zJp+Maj8`y!K;O&260?}-dtk9kO6-bQM-kISO?4dG!r!x$m2oXpC}LWulEt+!wtqj)
zJdJY^`vNLkq$>9`V?!2<fHywO%SYJ0N{@U@LY5B$al&;pzWN*96(`F_KREq58i)RD
zsC|iVPd{9$3hoeC(^v91bbnRIg5GYpPA$HNRQ;Fd>;gdqulfE4&6+Y`?`J^Nt||#n
zLk&t6wC511D!xOiMrKrvJ_3d0bJOH1zr{JNI^A}iR|i`3C9cjA_o}Uky$IB=?OzY?
zKh9du;nU3fqIYRle(36PE+1#Rii-7JgO4{}LxOhx^WE+C^F7vg5q}f!`V2je=Q9el
z<J;^z!}|S(h<97Sj^njJfOLA;uD@rWvVwhL#lAh@9M>aboi}jcBVZr`M&kW})->t=
zb<Zl+X3$#Rhz*T~nFSsXGZio*w>jPKS^BlCQ&=PvOlSmDbhz^>1<>(vXmHJ%_Rt44
zX<)8vaiPxZ8Q|{gX@B4mXeySp_d|CI90D0QLJSm&NI_6|&{4Rh35yT|2^Jp#nIMk;
zG7H-M&)eItSmWOi@tzCVaqJcd&`xg;v|?DR1xUQtB6u9LMFzOr$WfJq7{Tj^qbivI
z?ewsj=59EeA}jV-$5mv!^BRsg5zr9Dh{QHKW;w01_7xus84f#6G!&M2@ldGH5&25{
zle`BV7%3t!`?a`G|Md*;0O)Ce5xAo-*|9rDn3M4b6c{!MG9=V{1sra=!hm%9P}7F}
zQkPwk_Dz!=2pR;ODsJABI|w8Kw1$&m2onT~1F&Y3cnBXBW9#b#y0j}pLS0tC;Z7?I
zV7G6Iw(paz2pAH0;44GIBM^6}t1uwlF2Av&Z<E&uM1N|Rg8?5A3s;=zSP0RPx+{lb
zfL(&o1LNbOQp5>|%7cyqG{;9m#QPpI$44eWJN@^hzOEUK&@`*ac;_`7aU!508jfV^
ze@ZQX^8>(wj`uxl%Dlt(0%Tl#tQCehkydEX5qO~c{g6>pmq*^NVFO`+$H&7&6elbu
zHgt6G`F}4uSbQ|1=f7xKfbI{|<`Fj7=wPAlYYE`)YgxeVH^|wjIUwBeCgN<Agb#F?
z3j2_W$DZwtoymU}xLMJ#fsv%s=$H8HXkIBw1L;sz9CxHCYU1AP8z5;0=n&Lo-x+aj
zgbmOlz?qfs{f=ZjLN_1hrit&4(UsB+uttf=DSr>6?;f9OEaJbHa-B*uuu<oZZ<srs
ztL)rbxK5Y~XcWXx{h;T`SmL+&Zk;YYs7u>8M@h!7n0RhX3z4UwI`r*^qpMYQ#tmBm
z+GI?JvZC2N6W2DT067AjLrF92VIj_aJ^@-pJeL@uN;U3+36bJarE(?XUu`)~+Z~AS
zc7KroX&P7`iOqQil=$zY2$ZP+JA^);h6%)LYe9fI8Pg~%pR&hEmXc<exg>sh_v^Ij
zL5<>$Jf&LKs5_I=*9kPi8YNce1Z3h&(q1P?!82<avv0}%lz!XyGlu+IvLv5*rA7<1
zQEtQ2Q^~nsbZ*$4`Mv8@nqch;hh)KC7=O!C;+6CopiRc~5cW1|(h0QXg$C->K%3>Z
zWi{(f8q*_Jijv^WLdG;XVoe=fDM7?C@{cVQO~jtRIm2$95CzXJhK=b{Bt7R(kp;PE
z>lGT1?ONw8fH^H|y)3^vza7$YhA)pea+X#KnWGtH@T2?GVUZ)ZXPqcLs7sq>SbyZo
zdkK-}KP;M#J(T8mtl1y|!hGj<xT@Vak8kSLdgo09=(WH*v>(SM@Oe+tl|K+5R7G{@
z6Nj6|74-l?KEq9}q)ci;YeqF+hi&aSK$`~Ep|L)%`4PXh(g1Y|s6$)J7ryx4{hB_R
zMazi+T20sv$;$B0zOoh|(0lkd^nc1U(~<T?r**=#m{wtp*v6VxV%CdkY;ElLhkvI(
z#%lR%VaAn><M|K&8PBlgw>ZzRoo*Xv!f$Vn72AKUho4gSU(6I4RzUl$PKA>AtRs1H
zyg&5g_J+MLizimMFn@HgP;ayZNMCYBi}z*qT4(zM&=(CX))yTC+LxOn&3|&NCjuni
z4-p*g#jpKMSS#KT@lFWXXd^yWSQ}1wcoWXZXbV1N#kU6Qf&hs(LIekTar08qqHn8`
zx`dgbU_#wc(V=#<!{q5du7PIE5W!Z=v4KY1W-q@LSyHA9=7bChwL$?0dND3Jz012A
zW`%?Zbwb8Q8}Xi2!)Rd|AU^)`<>9ZtV|_6a;B7Kf#d9r|S@AuJWX*?cx9ggo0_%(f
zi#J9lK>ISJn&4}lIV#>22_I<6=e(lb{svb}lTZy9f6Q5R<Eq2wvOdy0`#$~$%oRHU
z)FlTM(6xJZET7oB>@m;EeNC&wE6gtwG}I|03EZRi<#G4}ZVlL<Icx-PBm%fEzm7ZK
zz^;g(5j>f}!yWll7W;9!9c+q(8^ITuA+|M_&7th_lzd~XB6)vpOu@#{eJ+dBO!FxP
zDhCI}e@qb}J{B@~sD_m*+HU)*gB3k8*fuSC1jqC|aI>Z)v1AAf+A0lp1fz5;aGM_J
zV?L!jLoJeUBiJJ|z^xf;#jja=_*f)mU9+U!c8^V4_k(P7fpSwtFwlea|H0E)12m=v
zJomTvFF)N+n)3q~SykQ-hToS%b%gu~R9J8be>DWaaGmPsZIx#@TQsOpS9JJ5Q^vfO
z`E=x%QULSEP5`yaK^50Ep0O0uHjyE=Z!@mA#%;Ufk=EIF;PB3aCANVIUtAM;+Ga%m
z1cKPsiBxFU;$(W6pz%&c&h+9O==pyQY6NGZ&Hqn%v}0fIlmLqLMhlL&NKb=zYS?8f
ze{32+d!>cH+&<i2KivHC&w+=weQ)})Ygr_&2$9hf#fi;ZPI!#a$mp@$Qh3m2jXmyn
zucuumSg*9;c&qd@Kg6laX84tiIr<HGd_8XqLeu_@y^h`0C;^&-M+=ZdGaUBE?fb@F
z`8JHmjFF1OHep4azzE3Eq9St8Cz@x&fBYOtS=(3BW!)f~8Vw^lB*ONZ2t#~5IzBW8
zo``0}7}*=(quLc~O#k(ewXCW7{&$>|2rE9+F*_5`GJe3<ZZ{0($TJ_!5$_#I8$0jP
zolZuT310;y3Ex0NytV_a6Q{*A3mX#Cds)*C7UH!ZXQeP5rb8GTwvrI%W|#m$e<Gek
z%z#B)n^6L!2yhN1Sw-x7K>}2q!5kX$VK)PD?#BpFBH|gvuyHRzuOq2T;@%OlPL2v_
z(K9CWlRqeNZAe(FWe#UlGMppQ@q~Kf+!C-(i-Knq!;hDG%)6SnQu5bHk>MOllJh(%
z-=Dp&+P^-Lg-P-OO3mp0!u^ud86hJ5w56wXYT}g~9-vIdbO;+y7gP$LX?Bx)5GoJs
zE|P`0!s}!@(2cU|aT1fX5ITSJXySZc@8T@V5XlyNm->24?<37AzmF_QVGPl%hjz)G
zPO(cyrzNr06v7axI$)R3cAQR|PD1rb;0qBbgSy1=Psy05MONq4g}j6ZkLA6_^r~Sr
zKjc>tuWYfkVl|jXVW;nEn(_9@)><b^2WeEr&lZw<+<f9@VeZsAwGMyug~F$!oR;P<
z!KBaN7iw?Lxin-=8WHRr6pDrKF`ebMEQtrEUPN~YS4JMzypRBez9WxGmUh0>o!1j0
zaLIh>>2x>Vew<GwTaZQ)pj40S)H<GsK08oyQ7%W2U_G{<WZ?|GB?%6}dKGAgT;8S$
ztKVcnia~%{J+iOZ^QwO{S+L$7u2w?!Roe`Uro?ykIb5a)>(DqmPEqYVxRzuCv>Kq7
z=+*QYoXesZ#eTho=ks!%egejGVRD^hJGiR3VfmOfU_HC=bd&S+!t;%{HT$C*j(VP-
zbiNhXOv|rxvJe)(MOYx8&r<9Hx`(p*7S%s90o*zZ71*`iDM)`rKG1)8>D;)Kto{Hj
zFVOZ@nYH^AiuJNB=ySzwS>}zg(ipE=Ym2_GVtIbNYJ2{oY16B;OypM`9+8-6LG#xp
zYfO)vuG^lyG6LrGk>ys%(;H|ZbU)4%BxSM7_9pH8s!ln-a%F(hsusX>ny1IrpCsOz
znbp=zueD}ZS`dGO&Iv}ZvVT<mVaB!_+QrtSkzduR<X5f?P+Fw|m`>~TxcZY^H<-kp
z&Lq~<Cc_#<S=b)rci5l5vcPKJSOV6)wE(04S<#ZRwClUL2_32-%NvRJnchph=gLWB
zd`E@K^rkaN+eO3C*tYi-PQErCt$k&IHNKsAq_*L`CK!MH&!+RSE$<?9ZH+zaD+{dg
zT{P!z8({Q5n|sWbx0$uJmbU)N3T}KSZGYSdWx61%y3*A;MhzBf`Cd)O!QkCnzd_gC
zxYFZjC8|9T^<m`~O%KH&^sR%U>(^ZAakN6!u8#V!@{34$C3lOoGzJ3ko}c!t$cmJ`
z%u!Ci^0$BeIQ;Xcx7#Lfggd^nysyXEMO%*G8xH=z|NTGxdz48xtY?oy(m!#OtNwrc
z8=4DRq!;7Tn`G7-^v~0+oBr0#bn8dq>2WQsnPA(lFX9=^jbP^+9;Sax4?b)4|3y}B
z#GRhswB<&4e$#Ab&o8j4kD4Yd)$8VTf%|^PJJLHmKs?g3;yeAsZx+KoT`-p;@yVYB
pu`?7wk3;B}*M5*!%=I>m*2!W1Apea&v*p*n&^`O<{{SDSC|u)CYz_bb

diff --git a/searchindex.js b/searchindex.js
index 9fa459b39..030a6872e 100644
--- a/searchindex.js
+++ b/searchindex.js
@@ -1 +1 @@
-Search.setIndex({"docnames": ["data_juicer", "data_juicer.analysis", "data_juicer.config", "data_juicer.core", "data_juicer.format", "data_juicer.ops", "data_juicer.ops.common", "data_juicer.ops.deduplicator", "data_juicer.ops.filter", "data_juicer.ops.mapper", "data_juicer.ops.selector", "data_juicer.tools", "data_juicer.utils", "index", "modules"], "filenames": ["data_juicer.rst", "data_juicer.analysis.rst", "data_juicer.config.rst", "data_juicer.core.rst", "data_juicer.format.rst", "data_juicer.ops.rst", "data_juicer.ops.common.rst", "data_juicer.ops.deduplicator.rst", "data_juicer.ops.filter.rst", "data_juicer.ops.mapper.rst", "data_juicer.ops.selector.rst", "data_juicer.tools.rst", "data_juicer.utils.rst", "index.rst", "modules.rst"], "titles": ["data_juicer", "data_juicer.analysis", "data_juicer.config", "data_juicer.core", "data_juicer.format", "data_juicer.ops", "data_juicer.ops.common", "data_juicer.ops.deduplicator", "data_juicer.ops.filter", "data_juicer.ops.mapper", "data_juicer.ops.selector", "data_juicer.tools", "data_juicer.utils", "Welcome to data-juicer\u2019s documentation!", "data_juicer"], "terms": {"cuda_device_count": [0, 14], "sourc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "is_cuda_avail": [0, 14], "class": [1, 3, 4, 5, 7, 8, 9, 10], "columnwiseanalysi": [1, 3, 13], "dataset": [1, 3, 4, 5, 7, 8, 9, 10], "output_path": 1, "overall_result": 1, "none": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "save_stats_in_one_fil": 1, "true": [1, 2, 3, 5, 6, 7, 8, 9, 10], "base": [1, 3, 4, 5, 7, 8, 9, 10], "object": [1, 2, 3, 8], "appli": [1, 3, 7, 9, 10], "each": [1, 3, 5, 7, 9], "column": [1, 3, 9], "stat": [1, 3, 5, 7, 8], "respect": [1, 9], "__init__": [1, 3, 4, 5, 7, 8, 9, 10], "initi": [1, 2, 3, 4, 7, 8, 9, 10], "method": [1, 3, 4, 6, 7, 8, 9, 10], "paramet": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "analyz": [1, 2, 3, 13], "path": [1, 2, 3, 4, 7], "store": [1, 3, 4, 5, 7, 8, 9], "result": [1, 3, 8], "option": [1, 3, 4], "precomput": 1, "overal": 1, "whether": [1, 2, 3, 4, 5, 6, 7, 8, 9], "save": [1, 2, 3], "all": [1, 3, 6, 8, 9], "figur": [1, 3, 9], "one": [1, 2, 6, 7, 8, 9], "imag": [1, 5, 7, 8, 9], "file": [1, 2, 3, 4, 5, 8, 9], "show_percentil": 1, "fals": [1, 2, 3, 4, 5, 6, 7, 8, 9], "show": [1, 3, 9], "skip_export": [1, 3], "draw": 1, "percentil": 1, "line": [1, 2, 8, 9], "sub": [1, 6, 7], "If": [1, 3, 7, 8, 9], "": [1, 3, 7, 8, 9], "sever": [1, 9], "red": 1, "indic": [1, 9], "quantil": 1, "distribut": [1, 3, 9], "singl": [1, 3, 9], "window": [1, 7], "after": [1, 3, 6, 7, 8, 9], "disk": [1, 3], "return": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "draw_hist": 1, "ax": 1, "data": [1, 4, 5, 8, 9], "save_path": 1, "histogram": 1, "includ": [1, 7, 8, 9], "inform": [1, 5, 7, 8, 10], "draw_box": 1, "box": [1, 9], "plot": 1, "diversityanalysi": [1, 13], "lang_or_model": 1, "en": [1, 8, 9], "divers": [1, 9], "sampl": [1, 3, 4, 5, 7, 8, 9, 10], "get": [1, 6], "an": [1, 3, 4, 5, 7, 8, 9], "param": [1, 2, 4, 6, 7, 9], "model": [1, 6, 7, 8, 9, 13], "specif": [1, 3, 5, 7, 8, 9], "languag": [1, 7, 8, 9], "us": [1, 2, 3, 4, 5, 6, 7, 8, 9, 13], "load": [1, 3, 4, 5, 9], "comput": [1, 3, 5, 6, 7, 8], "column_nam": 1, "text": [1, 4, 5, 7, 8, 9], "lexic": 1, "tree": [1, 8], "name": [1, 3, 4, 5, 8, 9], "postproc_func": 1, "function": [1, 6, 7], "get_divers": 1, "postproc_kwarg": 1, "whole": [1, 8], "In": [1, 3], "default": [1, 2, 3, 4, 7, 8, 9], "argument": [1, 3, 5, 8, 9], "overallanalysi": [1, 3, 13], "mean": [1, 3, 9], "std": 1, "etc": [1, 3, 4], "refine_single_column": 1, "col": 1, "num_proc": [1, 3, 4], "1": [1, 3, 4, 8, 9], "describ": 1, "panda": 1, "number": [1, 3, 4, 5, 7, 8, 9, 10], "process": [1, 3, 4, 5, 6, 7, 8, 9, 10, 13], "export": [1, 3, 4, 5, 13], "init_config": [2, 13], "arg": [2, 3, 4, 5, 7, 8, 9, 10], "jsonargpars": 2, "parser": 2, "pars": [2, 9], "from": [2, 3, 4, 5, 6, 7, 8, 9], "posix": 2, "style": 2, "command": [2, 4, 9], "yaml": 2, "json": [2, 3, 4, 8], "jsonnet": 2, "superset": 2, "environ": 2, "variabl": [2, 5], "hard": 2, "code": [2, 9], "list": [2, 3, 4, 5, 6, 8, 9], "e": [2, 3, 4, 8, 9], "g": [2, 3, 4, 9], "conifg": 2, "cfg": [2, 3, 4], "defaut": 2, "global": [2, 4, 9], "executor": [2, 3, 13], "export_config": [2, 13], "format": [2, 3, 8, 9, 13], "skip_non": 2, "skip_check": 2, "overwrit": [2, 9], "multifil": 2, "some": [2, 9], "ar": [2, 3, 6, 7, 8, 9, 10], "namespac": 2, "type": [2, 4, 9], "json_ind": 2, "parser_mod": 2, "exclud": 2, "entri": 2, "whose": [2, 8, 9], "valu": [2, 5, 7, 8, 9, 10], "i": [2, 3, 4, 5, 6, 7, 8, 9], "skip": 2, "check": 2, "exist": 2, "multipl": [2, 3, 4, 6, 7, 8], "__path__": 2, "meta": [2, 4], "merge_config": [2, 13], "ori_cfg": 2, "new_cfg": 2, "dict": [2, 3, 9], "merg": [2, 4, 6, 8], "configur": 2, "origin": [2, 3, 8, 9], "expect": [2, 9], "cfg_after_merg": 2, "thi": [3, 5, 6, 7, 8, 9, 10], "It": [3, 7, 8, 9], "filter": [3, 5, 7, 9, 13], "op": [3, 13], "config": [3, 5, 13], "analysi": [3, 13], "gener": [3, 9], "tabl": [3, 9], "help": 3, "user": 3, "understand": 3, "input": [3, 5, 7, 8, 9, 10], "better": [3, 8], "run": [3, 5, 8, 9], "load_data_np": 3, "pipelin": 3, "worker": 3, "when": [3, 4, 5, 7, 8, 9, 10], "nesteddataset": [3, 13], "karg": 3, "djdataset": 3, "enhanc": 3, "huggingfac": [3, 4, 8, 9], "usabl": 3, "effici": 3, "oper": 3, "checkpoint": 3, "tracer": [3, 5, 7, 13], "map": [3, 9], "overrid": 3, "func": 3, "which": [3, 5, 7, 8, 9], "call": 3, "most": [3, 9], "common": [3, 13], "can": [3, 8, 9], "access": 3, "nest": 3, "manner": 3, "select": [3, 4, 5, 8, 10], "classmethod": [3, 4], "from_dict": 3, "from_xx": 3, "constructor": 3, "construct": 3, "add_column": 3, "add": [3, 4], "select_column": 3, "remove_column": 3, "remov": [3, 5, 6, 8, 9], "cleanup_cache_fil": 3, "clear": 3, "raw": 3, "compress": 3, "cach": [3, 8], "unifi": [3, 4], "order": [3, 10], "sample_data": 3, "dataset_to_sampl": 3, "sample_ratio": 3, "float": [3, 8, 9], "0": [3, 4, 5, 7, 8, 9], "sample_algo": 3, "str": [3, 4, 6, 7, 8, 9, 10], "uniform": [3, 8, 9], "kwarg": [3, 4, 5, 7, 8, 9, 10], "subset": [3, 4], "given": [3, 8, 9], "formatt": [3, 4], "link": [3, 9], "The": [3, 4, 5, 8, 9, 10], "ratio": [3, 4, 6, 8, 9, 10], "size": [3, 6, 7, 8, 9], "algorithm": [3, 7, 9], "frequency_specified_field_selector": 3, "topk_specified_field_selector": 3, "A": [3, 5, 7, 9], "export_path": 3, "export_shard_s": 3, "export_in_parallel": 3, "export_d": 3, "keep_stats_in_res_d": 3, "keep_hashes_in_res_d": 3, "export_stat": 3, "kib": 3, "1024": 3, "mib": 3, "1048576": 3, "gib": 3, "1073741824": 3, "tib": 3, "1099511627776": 3, "shard": 3, "content": [3, 9], "keep": [3, 5, 7, 8, 9], "hash": [3, 5, 7], "export_compute_stat": 3, "statu": 3, "static": 3, "to_jsonl": 3, "jsonl": [3, 4], "target": [3, 8, 10], "extra": [3, 4, 7, 8, 9, 10], "to_json": 3, "to_parquet": 3, "parquet": [3, 4], "work_dir": 3, "show_num": [3, 5, 7], "10": [3, 8, 9], "trace": [3, 5, 7], "chang": [3, 9], "befor": [3, 8], "comparison": 3, "work": [3, 8, 9], "directori": [3, 4, 8], "maximum": [3, 8, 9], "trace_mapp": 3, "op_nam": 3, "previous_d": 3, "processed_d": 3, "text_kei": [3, 4, 5], "compar": 3, "mapper": [3, 5, 13], "mainli": 3, "differ": [3, 4, 6, 7, 8, 9], "pair": [3, 5, 7, 9], "due": 3, "modif": 3, "trace_batch_mapp": 3, "batchmapp": 3, "new": [3, 4, 9], "augment": [3, 6, 8, 9], "trace_filt": 3, "trace_dedupl": 3, "dup_pair": 3, "dedupl": [3, 5, 9, 13], "duplic": [3, 5, 7], "extract": [3, 8, 9], "other": [3, 8, 9], "two": [3, 7, 8], "embed": 3, "independ": [3, 8, 9], "obtain": [3, 6], "load_formatt": [4, 13], "dataset_path": 4, "suffix": [4, 8], "add_suffix": 4, "baseformatt": 4, "mixtur": 4, "weight": [4, 7, 9], "accord": [4, 5, 8, 9], "kei": [4, 5, 8, 9, 10], "field": [4, 5, 7, 8, 9, 10], "specifi": [4, 6, 8, 9, 10], "info": [4, 5], "jsonformatt": [4, 13], "localformatt": [4, 13], "zst": 4, "tupl": [4, 8], "local": 4, "packag": 4, "modul": [4, 13], "csv": 4, "load_dataset": 4, "int": [4, 8, 9], "global_cfg": 4, "its": [4, 5, 7, 9], "consequ": 4, "remoteformatt": [4, 13], "repositori": 4, "hub": 4, "textformatt": [4, 13], "txt": [4, 8], "pdf": [4, 8], "cpp": 4, "docx": [4, 8], "md": 4, "tex": [4, 9], "asm": 4, "bat": 4, "cmd": 4, "c": 4, "h": [4, 8, 9], "hpp": 4, "cc": 4, "hh": 4, "cmake": 4, "css": 4, "dockerfil": 4, "f90": 4, "f": 4, "f03": 4, "f08": 4, "f77": 4, "f95": 4, "fpp": 4, "go": 4, "html": [4, 9], "java": 4, "j": 4, "jl": 4, "lua": 4, "markdown": 4, "php": 4, "php3": 4, "php4": 4, "php5": 4, "phpt": 4, "pl": 4, "pm": 4, "pod": 4, "perl": 4, "ps1": 4, "psd1": 4, "psm1": 4, "py": 4, "rb": 4, "r": 4, "sql": 4, "scala": 4, "sh": 4, "bash": 4, "zsh": 4, "t": [4, 6, 7], "tsx": 4, "vb": 4, "makefil": 4, "xml": 4, "rst": 4, "m": [4, 9], "smali": 4, "datas": 4, "unified_format_dataset": 4, "parquetformatt": [4, 13], "csvformatt": [4, 13], "tsvformatt": [4, 13], "tsv": 4, "delimit": 4, "mixtureformatt": [4, 13], "max_sampl": 4, "mix": 4, "randomli": [4, 9], "everi": 4, "them": [4, 7, 8], "datasset": 4, "dir": 4, "w1": 4, "d": 4, "w2": 4, "ds_dir": 4, "w3": 4, "ds_file": 4, "max": [4, 7, 8, 9], "random_sampl": 4, "sample_numb": 4, "seed": 4, "bigger": [4, 9], "than": [4, 6, 7, 8, 9], "we": [4, 7, 8, 9, 13], "instead": [4, 6], "random": [4, 9], "42": 4, "load_op": [5, 13], "process_list": 5, "op_fus": 5, "item": 5, "fuse": 5, "share": 5, "same": 5, "intermedi": [5, 7, 8], "instanc": 5, "image_kei": 5, "audio_kei": 5, "audio": [5, 8, 9], "video_kei": [5, 9], "video": [5, 7, 8, 9], "compute_stat": [5, 7, 8], "context": [5, 7, 8, 9], "metric": [5, 7, 8], "decid": [5, 7, 8], "var": [5, 7, 8], "temporarili": [5, 7, 8], "For": [5, 7, 8, 9], "level": [5, 6, 7, 8, 9, 10], "boolean": [5, 7, 8], "conduct": 5, "edit": 5, "compute_hash": [5, 7], "doc": [5, 7], "open": [5, 7, 9], "selector": [5, 13], "get_sentences_from_docu": [6, 13], "document": [6, 7, 8, 9], "model_func": 6, "sentenc": [6, 9], "need": [6, 8, 9, 10], "split": [6, 9], "splite": 6, "separ": [6, 8, 10], "n": [6, 8, 9], "get_words_from_docu": [6, 13], "token_func": 6, "new_lin": 6, "tab": 6, "word": [6, 8, 9], "like": [6, 7, 8, 9], "stopword": [6, 8], "token": [6, 7, 8, 9], "merge_on_whitespace_tab_newlin": [6, 13], "invert": 6, "split_on_newline_tab_whitespac": [6, 13], "concaten": [6, 9], "first": [6, 7, 8, 9], "split_on_whitespac": [6, 13], "also": 6, "space": [6, 7], "tag": [6, 8, 9], "strip": [6, 13], "strip_charact": 6, "wai": [6, 9], "faster": 6, "sinc": 6, "now": [6, 9], "set": [6, 8, 9, 10], "contain": [6, 8, 9], "lot": 6, "element": 6, "emoji": 6, "charact": [6, 7, 8, 9], "words_augment": [6, 13], "group_siz": 6, "join_char": 6, "especi": [6, 8], "chines": [6, 7, 8, 9], "without": [6, 9], "between": [6, 7, 8, 9], "vietnames": [6, 8], "syllabl": 6, "group": [6, 8], "ad": [6, 9], "words_refin": [6, 13], "lower_cas": 6, "strip_char": 6, "use_words_aug": [6, 8], "words_aug_group_s": [6, 8], "2": [6, 8, 9], "words_aug_join_char": [6, 8], "refin": 6, "non": [6, 7, 9], "revers": [6, 10], "special": [6, 8, 9], "convert": [6, 7, 9], "lower": [6, 7, 8, 9], "case": [6, 7, 8, 9, 13], "lowercas": [6, 7, 9], "char": [6, 8, 9], "videodedupl": [7, 13], "consider_text": 7, "bool": [7, 8, 9, 10], "exact": 7, "match": [7, 8, 9], "consid": [7, 8, 9], "togeth": [7, 9], "raybasicdedupl": [7, 13], "redis_host": 7, "localhost": 7, "redis_port": 7, "positiveint": [7, 8, 9, 10], "6380": 7, "basic": 7, "rai": 7, "although": 7, "implement": 7, "empty_hash_valu": 7, "empti": [7, 9], "hostnam": 7, "redi": 7, "server": 7, "port": 7, "calculate_hash": 7, "calcul": [7, 8], "documentminhashdedupl": [7, 13], "window_s": 7, "5": [7, 8, 9], "ignore_pattern": 7, "num_permut": 7, "256": 7, "jaccard_threshold": 7, "closedunitinterv": [7, 8, 9, 10], "7": [7, 9], "num_band": 7, "num_rows_per_band": 7, "tokenizer_model": 7, "minhashlsh": 7, "simhash": 7, "minhash": 7, "byte": [7, 8], "so": [7, 8, 9], "thei": 7, "won": 7, "kept": [7, 8], "final": [7, 9], "should": [7, 8, 9], "punctuat": [7, 9], "sentencepiec": 7, "english": [7, 8, 9], "recommend": [7, 9], "pleas": 7, "provid": [7, 9], "shingl": 7, "ignor": [7, 9], "string": [7, 8, 9], "pattern": [7, 9], "permut": 7, "min": [7, 8, 9], "jaccard": 7, "similar": [7, 8, 9], "threshold": [7, 8, 9], "detect": [7, 8, 9], "regard": 7, "onli": [7, 8, 9], "band": 7, "lsh": 7, "determin": [7, 10], "optim": 7, "minim": 7, "sum": 7, "prob": 7, "posit": [7, 8, 9], "neg": [7, 9], "row": 7, "rayimagededupl": [7, 13], "phash": 7, "raydocumentdedupl": [7, 13], "ignore_non_charact": 7, "alphabet": [7, 8, 9], "whitespac": [7, 9], "digit": 7, "documentdedupl": [7, 13], "md5": 7, "imagededupl": [7, 13], "documentsimhashdedupl": [7, 13], "6": [7, 8], "num_block": 7, "hamming_dist": 7, "4": [7, 8, 9], "And": 7, "block": 7, "ham": 7, "distanc": 7, "alwai": 7, "less": [7, 8, 9], "rayvideodedupl": [7, 13], "imagetextsimilarityfilt": [8, 13], "hf_clip": 8, "openai": 8, "clip": [8, 9], "vit": 8, "patch32": 8, "min_scor": 8, "max_scor": 8, "horizontal_flip": [8, 9], "vertical_flip": [8, 9], "any_or_al": [8, 9], "ani": [8, 9], "reduce_mod": 8, "avg": 8, "those": 8, "within": [8, 9, 10], "rang": [8, 9], "flip": [8, 9], "horizont": [8, 9], "left": [8, 9], "right": [8, 9], "vertic": [8, 9], "top": [8, 9, 10], "bottom": [8, 9], "strategi": [8, 9], "meet": [8, 9], "condit": [8, 9], "reduc": [8, 9], "mode": [8, 9], "correspond": [8, 10], "chunk": 8, "take": 8, "averag": 8, "rank": [8, 9], "videoaspectratiofilt": [8, 13], "min_ratio": [8, 9], "9": [8, 9], "21": [8, 9], "max_ratio": [8, 9], "aspect": [8, 9], "aspectratio": [8, 9], "w": [8, 9], "minimum": [8, 9], "support": [8, 9], "imagetextmatchingfilt": [8, 13], "hf_blip": 8, "salesforc": [8, 9], "blip": [8, 9], "itm": 8, "coco": 8, "003": 8, "score": 8, "imagensfwfilt": [8, 13], "hf_nsfw_model": 8, "falconsai": 8, "nsfw_image_detect": 8, "score_threshold": 8, "have": 8, "low": 8, "nsfw": 8, "tokennumfilt": [8, 13], "hf_token": 8, "eleutherai": 8, "pythia": 8, "9b": 8, "dedup": 8, "min_num": 8, "max_num": 8, "9223372036854775807": [8, 9], "total": [8, 9], "hug": 8, "face": [8, 9], "below": [8, 9], "exce": [8, 9], "textlengthfilt": [8, 13], "min_len": [8, 9], "max_len": [8, 9], "length": [8, 9], "specifiednumericfieldfilt": [8, 13], "field_kei": [8, 10], "min_valu": 8, "max_valu": 8, "numer": 8, "multi": [8, 10, 13], "specifiednumericfield": 8, "audionmfsnrfilt": [8, 13], "min_snr": 8, "max_snr": 8, "nmf_iter_num": 8, "500": [8, 9], "snr": 8, "nmf": 8, "db": 8, "sy": 8, "maxsiz": 8, "iter": [8, 9], "videoaestheticsfilt": [8, 13], "hf_scorer_model": 8, "frame_sampling_method": [8, 9], "frame_num": [8, 9], "3": [8, 9], "aesthet": 8, "frame": [8, 9], "predictor": 8, "By": 8, "shunk031": 8, "v2": 8, "sac": 8, "logo": 8, "ava1": 8, "l14": 8, "linearms": 8, "refer": [8, 9], "pypi": 8, "org": [8, 9], "project": 8, "simpl": [8, 9], "predict": 8, "all_keyfram": [8, 9], "former": [8, 9], "latter": [8, 9], "uniformli": [8, 9], "keyfram": 8, "larg": 8, "while": 8, "usual": 8, "small": 8, "term": 8, "middl": [8, 9], "last": [8, 9], "larger": [8, 9], "addit": [8, 9], "durat": [8, 9], "must": [8, 9], "keyword": [8, 9], "perplexityfilt": [8, 13], "lang": [8, 9], "max_ppl": 8, "positivefloat": 8, "1500": 8, "perplex": 8, "phrasegroundingrecallfilt": [8, 13], "hf_owlvit": 8, "googl": 8, "owlvit": 8, "min_recal": 8, "max_recal": 8, "iou_thr": 8, "large_area_ratio_thr": 8, "95": 8, "conf_thr": 8, "locat": [8, 9], "recal": 8, "phrase": 8, "owl": 8, "ground": 8, "iou": 8, "nm": 8, "post": 8, "bbox": 8, "overlap": 8, "confid": 8, "area": 8, "out": 8, "account": 8, "more": [8, 9, 13], "maximumlinelengthfilt": [8, 13], "averagelinelengthfilt": [8, 13], "specifiedfieldfilt": [8, 13], "target_valu": 8, "retain": [8, 9], "videotaggingfromframesfilt": [8, 13], "peopl": 8, "shift": 8, "found": [8, 9], "http": [8, 9], "github": 8, "com": 8, "xinyu1205": 8, "recogn": 8, "anyth": 8, "blob": 8, "main": [8, 9], "ram": 8, "ram_tag_list": 8, "noqa": 8, "e501": 8, "requir": 8, "equal": [8, 9], "depend": [8, 9], "textentitydependencyfilt": [8, 13], "min_dependency_num": 8, "identifi": [8, 9], "entiti": 8, "omit": 8, "zh": 8, "mini_dependency_num": 8, "edg": 8, "objet": 8, "videoresolutionfilt": [8, 13], "min_width": [8, 9], "max_width": [8, 9], "min_height": [8, 9], "max_height": [8, 9], "resolut": [8, 9], "alphanumericfilt": [8, 13], "25": 8, "count": 8, "alphanumer": 8, "imagewatermarkfilt": [8, 13], "hf_watermark_model": 8, "amrul": 8, "hzz": 8, "watermark_detector": 8, "prob_threshold": 8, "8": [8, 9], "watermark": [8, 9], "high": 8, "probabl": [8, 9], "imageaestheticsfilt": [8, 13], "audiosizefilt": [8, 13], "min_siz": 8, "max_siz": 8, "1tb": 8, "kb": 8, "mb": 8, "constraint": 8, "approxim": 8, "un": 8, "limit": 8, "stopwordsfilt": [8, 13], "stopwords_dir": 8, "home": 8, "runner": 8, "asset": 8, "what": 8, "adopt": 8, "avail": 8, "join": 8, "characterrepetitionfilt": [8, 13], "rep_len": 8, "gram": 8, "repetit": 8, "imageshapefilt": [8, 13], "shape": 8, "width": [8, 9], "height": [8, 9], "videodurationfilt": [8, 13], "min_dur": 8, "nonnegativeint": [8, 9], "max_dur": 8, "second": [8, 9], "textactionfilt": [8, 13], "min_action_num": 8, "action": 8, "mini_action_num": 8, "videoocrarearatiofilt": [8, 13], "min_area_ratio": 8, "max_area_ratio": 8, "frame_sample_num": 8, "languages_to_detect": 8, "ch_sim": 8, "ocr": [8, 9], "evenli": 8, "full": [8, 9], "here": [8, 9, 13], "www": 8, "jaid": 8, "ai": [8, 9], "easyocr": 8, "get_read": 8, "videonsfwfilt": [8, 13], "specialcharactersfilt": [8, 13], "videoframestextsimilarityfilt": [8, 13], "kind": [8, 9], "relat": 8, "exampl": 8, "chineseclip": 8, "might": [8, 9], "choic": 8, "imageaspectratiofilt": [8, 13], "333": 8, "audiodurationfilt": [8, 13], "languageidscorefilt": [8, 13], "identif": 8, "suffixfilt": [8, 13], "imagesizefilt": [8, 13], "videowatermarkfilt": [8, 13], "wordsnumfilt": [8, 13], "imagefaceratiofilt": [8, 13], "largest": 8, "flaggedwordfilt": [8, 13], "045": 8, "flagged_words_dir": 8, "flag": 8, "flagged_word": 8, "wordrepetitionfilt": [8, 13], "videomotionscorefilt": [8, 13], "7976931348623157e": 8, "308": 8, "sampling_fp": 8, "sequenc": [8, 9], "rel": 8, "motion": 8, "farneback": 8, "algorith": 8, "opencv": 8, "dens": 8, "optic": 8, "flow": 8, "rate": 8, "frames_per_second": 8, "resiz": [8, 9], "smaller": [8, 9, 10], "rescal": 8, "allow": [8, 9], "longer": 8, "greater": [8, 9], "being": [8, 9], "overrul": 8, "As": 8, "mai": 8, "shorter": [8, 9], "magnitud": 8, "normal": [8, 9], "diagon": 8, "videocaptioningfromaudiomapp": [9, 13], "keep_original_sampl": 9, "caption": 9, "stream": 9, "qwen": 9, "videotaggingfromaudiomapp": [9, 13], "hf_ast": 9, "mit": 9, "ast": 9, "finetun": 9, "audioset": 9, "4593": 9, "spectrogram": 9, "transform": 9, "imagecaptioningfromgpt4vmapp": [9, 13], "descript": 9, "api_kei": 9, "max_token": 9, "temperatur": 9, "system_prompt": 9, "user_prompt": 9, "user_prompt_kei": 9, "gpt": 9, "visison": 9, "reson": 9, "convers": 9, "custom": 9, "api": 9, "authent": 9, "request": 9, "control": 9, "output": 9, "prompt": 9, "guidanc": [9, 13], "rule": [9, 10], "gpt4": 9, "vision": 9, "respons": 9, "guid": 9, "uers_prompt_kei": 9, "punctuationnormalizationmapp": [9, 13], "unicod": 9, "removebibliographymapp": [9, 13], "bibliographi": 9, "end": 9, "latex": 9, "sentencesplitmapp": [9, 13], "videosplitbyscenemapp": [9, 13], "detector": 9, "contentdetector": 9, "nonnegativefloat": 9, "27": 9, "min_scene_len": 9, "15": 9, "show_progress": 9, "cut": 9, "scene": 9, "avaliable_detector": 9, "adaptivedetector": 9, "window_width": 9, "min_content_v": 9, "luma_onli": 9, "kernel_s": 9, "video_manag": 9, "min_delta_hsv": 9, "thresholddetector": 9, "fade_bia": 9, "add_final_scen": 9, "block_siz": 9, "scenedetect": 9, "pass": 9, "progress": 9, "cleanipmapp": [9, 13], "repl": 9, "clean": 9, "ipv4": 9, "ipv6": 9, "address": 9, "regular": 9, "express": 9, "search": [9, 13], "replac": 9, "cleanlinksmapp": [9, 13], "ftp": 9, "removeheadermapp": [9, 13], "drop_no_head": 9, "header": 9, "begin": 9, "drop": 9, "removetabletextmapp": [9, 13], "min_col": 9, "from_2_to_20": 9, "max_col": 9, "20": 9, "videoremovewatermarkmapp": [9, 13], "roi_str": 9, "roi_typ": 9, "roi_kei": 9, "min_frame_threshold": 9, "detection_method": 9, "pixel_valu": 9, "region": 9, "x1": 9, "y1": 9, "x2": 9, "y2": 9, "roi": 9, "pixel": 9, "corner": 9, "coordin": 9, "wight": 9, "coodin": 9, "pixel_divers": 9, "useless": 9, "removerepeatsentencesmapp": [9, 13], "ignore_special_charact": 9, "min_repeat_sentence_length": 9, "repeat": 9, "judg": 9, "except": 9, "letter": 9, "imagediffusionmapp": [9, 13], "hf_diffus": 9, "compvi": 9, "stabl": 9, "diffus": 9, "v1": 9, "torch_dtyp": 9, "fp32": 9, "revis": 9, "strength": 9, "guidance_scal": 9, "aug_num": 9, "caption_kei": 9, "hf_img2seq": 9, "blip2": 9, "opt": 9, "7b": 9, "point": 9, "fp16": 9, "bf16": 9, "version": 9, "branch": 9, "commit": 9, "id": 9, "git": 9, "extent": 9, "start": 9, "nois": 9, "higher": 9, "denois": 9, "step": 9, "amount": 9, "num_inference_step": 9, "essenti": 9, "scale": 9, "encourag": 9, "close": 9, "expens": 9, "qualiti": 9, "enabl": 9, "produc": 9, "keep_candidate_mod": 9, "caption_num": 9, "candid": 9, "random_ani": 9, "similar_one_simhash": 9, "batched_op": 9, "both": [9, 10], "suppos": 9, "batch": 9, "b": 9, "denot": 9, "2nb": 9, "nb": 9, "mnb": 9, "otherwis": 9, "imagefaceblurmapp": [9, 13], "blur_typ": 9, "gaussian": 9, "radiu": 9, "blur": 9, "kernel": 9, "videoffmpegwrappedmapp": [9, 13], "filter_nam": 9, "filter_kwarg": 9, "global_arg": 9, "capture_stderr": 9, "overwrite_output": 9, "wrapper": 9, "ffmpeg": 9, "captur": 9, "stderr": 9, "chineseconvertmapp": [9, 13], "s2t": 9, "tradit": 9, "simplifi": 9, "japanes": 9, "kanji": 9, "choos": 9, "t2": 9, "s2tw": 9, "taiwan": 9, "standard": 9, "tw2": 9, "s2hk": 9, "hong": 9, "kong": 9, "variant": 9, "hk2": 9, "s2twp": 9, "taiwanes": 9, "idiom": 9, "tw2sp": 9, "mainland": 9, "t2tw": 9, "tw2t": 9, "hk2t": 9, "t2hk": 9, "t2jp": 9, "ky\u016bjitai": 9, "jp2t": 9, "shinjitai": 9, "nlpcdazhmapp": [9, 13], "sequenti": 9, "replace_similar_word": 9, "replace_homophone_char": 9, "delete_random_char": 9, "swap_random_char": 9, "replace_equivalent_num": 9, "simpli": 9, "nlpcda": 9, "librari": 9, "you": 9, "time": 9, "semant": 9, "significantli": 9, "notic": 9, "combin": 9, "would": 9, "opened_aug_method": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "\u8fd9\u8fb9\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "homophon": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6fd6\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "delet": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a": 9, "swap": 9, "contigu": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u5f3a\u589e\u65b9\u6cd5": 9, "equival": 9, "represent": 9, "\u8fd9\u91cc\u4e00\u5171\u6709\u4f0d\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "imageblurmapp": [9, 13], "p": 9, "blure": 9, "cleancopyrightmapp": [9, 13], "copyright": 9, "comment": 9, "removenonchinesecharacterlmapp": [9, 13], "keep_alphabet": 9, "keep_numb": 9, "keep_punc": 9, "videosplitbykeyframemapp": [9, 13], "get_split_key_fram": 9, "removespecificcharsmapp": [9, 13], "chars_to_remov": 9, "videoresizeaspectratiomapp": [9, 13], "increas": 9, "decreas": 9, "enforc": 9, "abov": 9, "adjust": 9, "dimens": 9, "either": 9, "enlarg": 9, "accept": 9, "cleanhtmlmapp": [9, 13], "whitespacenormalizationmapp": [9, 13], "0x20": 9, "wikipedia": 9, "wiki": 9, "whitespace_charact": 9, "videotaggingfromframesmapp": [9, 13], "removecommentsmapp": [9, 13], "doc_typ": 9, "inlin": 9, "multilin": 9, "expandmacromapp": [9, 13], "expand": 9, "macro": 9, "definit": 9, "bodi": 9, "extractqamapp": [9, 13], "hf_model": 9, "alibaba": 9, "pai": 9, "qwen1_5": 9, "doc2qa": 9, "qa_format": 9, "chatml": 9, "question": 9, "answer": 9, "llama3": 9, "8b": 9, "baichuan2": 9, "4b": 9, "1b8": 9, "0b5": 9, "These": 9, "train": 9, "suitabl": 9, "hugginfac": 9, "interfac": 9, "follow": 9, "\u8499\u53e4\u56fd\u7684\u9996\u90fd\u662f\u4e4c\u5170\u5df4\u6258": 9, "ulaanbaatar": 9, "\u51b0\u5c9b\u7684\u9996\u90fd\u662f\u96f7\u514b\u96c5\u672a\u514b": 9, "reykjavik": 9, "human": 9, "\u8bf7\u95ee\u8499\u53e4\u56fd\u7684\u9996\u90fd\u662f\u54ea\u91cc": 9, "assist": 9, "\u4f60\u597d": 9, "\u6839\u636e\u63d0\u4f9b\u7684\u4fe1\u606f": 9, "\u51b0\u5c9b\u7684\u9996\u90fd\u662f\u54ea\u91cc\u5462": 9, "imagecaptioningmapp": [9, 13], "prompt_kei": 9, "anoth": 9, "how": 9, "mani": 9, "similar_on": 9, "removewordswithincorrectsubstringsmapp": [9, 13], "substr": 9, "incorrect": 9, "should_keep_word_with_incorrect_substr": 9, "videocaptioningfromvideomapp": [9, 13], "hf_video_blip": 9, "kpyu": 9, "ego4d": 9, "videocaptioningfromsummarizermapp": [9, 13], "hf_summar": 9, "consider_video_caption_from_video": 9, "consider_video_caption_from_audio": 9, "consider_video_caption_from_fram": 9, "consider_video_tags_from_audio": 9, "consider_video_tags_from_fram": 9, "vid_cap_from_vid_arg": 9, "vid_cap_from_frm_arg": 9, "vid_tag_from_aud_arg": 9, "vid_tag_from_frm_arg": 9, "keep_tag_num": 9, "summar": 9, "directli": 9, "too": 9, "bring": 9, "influenc": 9, "frequent": 9, "fixunicodemapp": [9, 13], "fix": 9, "error": 9, "form": 9, "nfc": 9, "nfkc": 9, "nfd": 9, "nfkd": 9, "nlpaugenmapp": [9, 13], "delete_random_word": 9, "swap_random_word": 9, "spelling_error_word": 9, "split_random_word": 9, "keyboard_error_char": 9, "ocr_error_char": 9, "insert_random_char": 9, "nlpaug": 9, "love": 9, "llm": 9, "simul": 9, "spell": 9, "ll": 9, "keyboard": 9, "ov4": 9, "10ve": 9, "oe": 9, "ovl": 9, "insert": 9, "lkove": 9, "videocaptioningfromframesmapp": [9, 13], "removelongwordsmapp": [9, 13], "long": 9, "should_keep_long_word": 9, "videoresizeresolutionmapp": [9, 13], "force_original_aspect_ratio": 9, "disabl": 9, "force_divisible_bi": 9, "leav": 9, "super": 9, "deep": 9, "learn": 9, "futur": 9, "necessari": 9, "ensur": 9, "divis": 9, "integ": 9, "even": 9, "cleanemailmapp": [9, 13], "email": 9, "replacecontentmapp": [9, 13], "design": 9, "audioffmpegwrappedmapp": [9, 13], "videosplitbydurationmapp": [9, 13], "split_dur": 9, "min_last_split_dur": 9, "discard": 9, "split_videos_by_dur": 9, "videofaceblurmapp": [9, 13], "frequencyspecifiedfieldselector": [10, 13], "top_ratio": 10, "topk": 10, "sort": 10, "frequenc": 10, "descend": 10, "topkspecifiedfieldselector": [10, 13], "give": 13, "kdd": 13, "24": 13, "modal": 13, "foundat": 13, "practic": 13, "see": 13, "detail": 13, "data_juic": 13, "core": 13, "index": 13, "page": 13}, "objects": {"": [[0, 0, 0, "-", "data_juicer"]], "data_juicer": [[1, 0, 0, "-", "analysis"], [2, 0, 0, "-", "config"], [3, 0, 0, "-", "core"], [0, 3, 1, "", "cuda_device_count"], [4, 0, 0, "-", "format"], [0, 3, 1, "", "is_cuda_available"], [5, 0, 0, "-", "ops"], [11, 0, 0, "-", "tools"], [12, 0, 0, "-", "utils"]], "data_juicer.analysis": [[1, 1, 1, "", "ColumnWiseAnalysis"], [1, 1, 1, "", "DiversityAnalysis"], [1, 1, 1, "", "OverallAnalysis"]], "data_juicer.analysis.ColumnWiseAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "draw_box"], [1, 2, 1, "", "draw_hist"]], "data_juicer.analysis.DiversityAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "compute"]], "data_juicer.analysis.OverallAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "refine_single_column"]], "data_juicer.config": [[2, 3, 1, "", "export_config"], [2, 3, 1, "", "init_configs"], [2, 3, 1, "", "merge_config"]], "data_juicer.core": [[3, 1, 1, "", "Analyzer"], [3, 1, 1, "", "Executor"], [3, 1, 1, "", "Exporter"], [3, 1, 1, "", "NestedDataset"], [3, 1, 1, "", "Tracer"]], "data_juicer.core.Analyzer": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "run"]], "data_juicer.core.Executor": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "run"], [3, 2, 1, "", "sample_data"]], "data_juicer.core.Exporter": [[3, 4, 1, "", "GiB"], [3, 4, 1, "", "KiB"], [3, 4, 1, "", "MiB"], [3, 4, 1, "", "TiB"], [3, 2, 1, "", "__init__"], [3, 2, 1, "", "export"], [3, 2, 1, "", "export_compute_stats"], [3, 2, 1, "", "to_json"], [3, 2, 1, "", "to_jsonl"], [3, 2, 1, "", "to_parquet"]], "data_juicer.core.NestedDataset": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "add_column"], [3, 2, 1, "", "cleanup_cache_files"], [3, 2, 1, "", "filter"], [3, 2, 1, "", "from_dict"], [3, 2, 1, "", "map"], [3, 2, 1, "", "process"], [3, 2, 1, "", "remove_columns"], [3, 2, 1, "", "select"], [3, 2, 1, "", "select_columns"]], "data_juicer.core.Tracer": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "trace_batch_mapper"], [3, 2, 1, "", "trace_deduplicator"], [3, 2, 1, "", "trace_filter"], [3, 2, 1, "", "trace_mapper"]], "data_juicer.format": [[4, 1, 1, "", "CsvFormatter"], [4, 1, 1, "", "JsonFormatter"], [4, 1, 1, "", "LocalFormatter"], [4, 1, 1, "", "MixtureFormatter"], [4, 1, 1, "", "ParquetFormatter"], [4, 1, 1, "", "RemoteFormatter"], [4, 1, 1, "", "TextFormatter"], [4, 1, 1, "", "TsvFormatter"], [4, 3, 1, "", "load_formatter"]], "data_juicer.format.CsvFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.JsonFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.LocalFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.MixtureFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"], [4, 2, 1, "", "random_sample"]], "data_juicer.format.ParquetFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.RemoteFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.TextFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.TsvFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.ops": [[5, 1, 1, "", "Deduplicator"], [5, 1, 1, "", "Filter"], [5, 1, 1, "", "Mapper"], [5, 1, 1, "", "Selector"], [6, 0, 0, "-", "common"], [7, 0, 0, "-", "deduplicator"], [8, 0, 0, "-", "filter"], [5, 3, 1, "", "load_ops"], [9, 0, 0, "-", "mapper"], [10, 0, 0, "-", "selector"]], "data_juicer.ops.Deduplicator": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "compute_hash"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.Filter": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "compute_stats"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.Mapper": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.Selector": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.common": [[6, 3, 1, "", "get_sentences_from_document"], [6, 3, 1, "", "get_words_from_document"], [6, 3, 1, "", "merge_on_whitespace_tab_newline"], [6, 3, 1, "", "split_on_newline_tab_whitespace"], [6, 3, 1, "", "split_on_whitespace"], [6, 3, 1, "", "strip"], [6, 3, 1, "", "words_augmentation"], [6, 3, 1, "", "words_refinement"]], "data_juicer.ops.deduplicator": [[7, 1, 1, "", "DocumentDeduplicator"], [7, 1, 1, "", "DocumentMinhashDeduplicator"], [7, 1, 1, "", "DocumentSimhashDeduplicator"], [7, 1, 1, "", "ImageDeduplicator"], [7, 1, 1, "", "RayBasicDeduplicator"], [7, 1, 1, "", "RayDocumentDeduplicator"], [7, 1, 1, "", "RayImageDeduplicator"], [7, 1, 1, "", "RayVideoDeduplicator"], [7, 1, 1, "", "VideoDeduplicator"]], "data_juicer.ops.deduplicator.DocumentDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.ImageDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.RayBasicDeduplicator": [[7, 4, 1, "", "EMPTY_HASH_VALUE"], [7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"], [7, 2, 1, "", "compute_stats"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.RayDocumentDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.RayImageDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.RayVideoDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.VideoDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.filter": [[8, 1, 1, "", "AlphanumericFilter"], [8, 1, 1, "", "AudioDurationFilter"], [8, 1, 1, "", "AudioNMFSNRFilter"], [8, 1, 1, "", "AudioSizeFilter"], [8, 1, 1, "", "AverageLineLengthFilter"], [8, 1, 1, "", "CharacterRepetitionFilter"], [8, 1, 1, "", "FlaggedWordFilter"], [8, 1, 1, "", "ImageAestheticsFilter"], [8, 1, 1, "", "ImageAspectRatioFilter"], [8, 1, 1, "", "ImageFaceRatioFilter"], [8, 1, 1, "", "ImageNSFWFilter"], [8, 1, 1, "", "ImageShapeFilter"], [8, 1, 1, "", "ImageSizeFilter"], [8, 1, 1, "", "ImageTextMatchingFilter"], [8, 1, 1, "", "ImageTextSimilarityFilter"], [8, 1, 1, "", "ImageWatermarkFilter"], [8, 1, 1, "", "LanguageIDScoreFilter"], [8, 1, 1, "", "MaximumLineLengthFilter"], [8, 1, 1, "", "PerplexityFilter"], [8, 1, 1, "", "PhraseGroundingRecallFilter"], [8, 1, 1, "", "SpecialCharactersFilter"], [8, 1, 1, "", "SpecifiedFieldFilter"], [8, 1, 1, "", "SpecifiedNumericFieldFilter"], [8, 1, 1, "", "StopWordsFilter"], [8, 1, 1, "", "SuffixFilter"], [8, 1, 1, "", "TextActionFilter"], [8, 1, 1, "", "TextEntityDependencyFilter"], [8, 1, 1, "", "TextLengthFilter"], [8, 1, 1, "", "TokenNumFilter"], [8, 1, 1, "", "VideoAestheticsFilter"], [8, 1, 1, "", "VideoAspectRatioFilter"], [8, 1, 1, "", "VideoDurationFilter"], [8, 1, 1, "", "VideoFramesTextSimilarityFilter"], [8, 1, 1, "", "VideoMotionScoreFilter"], [8, 1, 1, "", "VideoNSFWFilter"], [8, 1, 1, "", "VideoOcrAreaRatioFilter"], [8, 1, 1, "", "VideoResolutionFilter"], [8, 1, 1, "", "VideoTaggingFromFramesFilter"], [8, 1, 1, "", "VideoWatermarkFilter"], [8, 1, 1, "", "WordRepetitionFilter"], [8, 1, 1, "", "WordsNumFilter"]], "data_juicer.ops.filter.AlphanumericFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.AudioDurationFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.AudioNMFSNRFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.AudioSizeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.AverageLineLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.CharacterRepetitionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.FlaggedWordFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageAestheticsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageAspectRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageFaceRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageNSFWFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageShapeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageSizeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageTextMatchingFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageTextSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageWatermarkFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.LanguageIDScoreFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.MaximumLineLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.PerplexityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.PhraseGroundingRecallFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.SpecialCharactersFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.SpecifiedFieldFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.SpecifiedNumericFieldFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.StopWordsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.SuffixFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.TextActionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.TextEntityDependencyFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.TextLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.TokenNumFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoAestheticsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoAspectRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoDurationFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoFramesTextSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoMotionScoreFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoNSFWFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoOcrAreaRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "get_reader"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoResolutionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoTaggingFromFramesFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoWatermarkFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.WordRepetitionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.WordsNumFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.mapper": [[9, 1, 1, "", "AudioFFmpegWrappedMapper"], [9, 1, 1, "", "ChineseConvertMapper"], [9, 1, 1, "", "CleanCopyrightMapper"], [9, 1, 1, "", "CleanEmailMapper"], [9, 1, 1, "", "CleanHtmlMapper"], [9, 1, 1, "", "CleanIpMapper"], [9, 1, 1, "", "CleanLinksMapper"], [9, 1, 1, "", "ExpandMacroMapper"], [9, 1, 1, "", "ExtractQAMapper"], [9, 1, 1, "", "FixUnicodeMapper"], [9, 1, 1, "", "ImageBlurMapper"], [9, 1, 1, "", "ImageCaptioningFromGPT4VMapper"], [9, 1, 1, "", "ImageCaptioningMapper"], [9, 1, 1, "", "ImageDiffusionMapper"], [9, 1, 1, "", "ImageFaceBlurMapper"], [9, 1, 1, "", "NlpaugEnMapper"], [9, 1, 1, "", "NlpcdaZhMapper"], [9, 1, 1, "", "PunctuationNormalizationMapper"], [9, 1, 1, "", "RemoveBibliographyMapper"], [9, 1, 1, "", "RemoveCommentsMapper"], [9, 1, 1, "", "RemoveHeaderMapper"], [9, 1, 1, "", "RemoveLongWordsMapper"], [9, 1, 1, "", "RemoveNonChineseCharacterlMapper"], [9, 1, 1, "", "RemoveRepeatSentencesMapper"], [9, 1, 1, "", "RemoveSpecificCharsMapper"], [9, 1, 1, "", "RemoveTableTextMapper"], [9, 1, 1, "", "RemoveWordsWithIncorrectSubstringsMapper"], [9, 1, 1, "", "ReplaceContentMapper"], [9, 1, 1, "", "SentenceSplitMapper"], [9, 1, 1, "", "VideoCaptioningFromAudioMapper"], [9, 1, 1, "", "VideoCaptioningFromFramesMapper"], [9, 1, 1, "", "VideoCaptioningFromSummarizerMapper"], [9, 1, 1, "", "VideoCaptioningFromVideoMapper"], [9, 1, 1, "", "VideoFFmpegWrappedMapper"], [9, 1, 1, "", "VideoFaceBlurMapper"], [9, 1, 1, "", "VideoRemoveWatermarkMapper"], [9, 1, 1, "", "VideoResizeAspectRatioMapper"], [9, 1, 1, "", "VideoResizeResolutionMapper"], [9, 1, 1, "", "VideoSplitByDurationMapper"], [9, 1, 1, "", "VideoSplitByKeyFrameMapper"], [9, 1, 1, "", "VideoSplitBySceneMapper"], [9, 1, 1, "", "VideoTaggingFromAudioMapper"], [9, 1, 1, "", "VideoTaggingFromFramesMapper"], [9, 1, 1, "", "WhitespaceNormalizationMapper"]], "data_juicer.ops.mapper.AudioFFmpegWrappedMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ChineseConvertMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.CleanCopyrightMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.CleanEmailMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.CleanHtmlMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.CleanIpMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.CleanLinksMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ExpandMacroMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ExtractQAMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.FixUnicodeMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ImageBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ImageCaptioningMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ImageDiffusionMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ImageFaceBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.NlpaugEnMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.NlpcdaZhMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.PunctuationNormalizationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveBibliographyMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveCommentsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveHeaderMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveLongWordsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"], [9, 2, 1, "", "should_keep_long_word"]], "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveRepeatSentencesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveSpecificCharsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveTableTextMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"], [9, 2, 1, "", "should_keep_word_with_incorrect_substrings"]], "data_juicer.ops.mapper.ReplaceContentMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.SentenceSplitMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoFFmpegWrappedMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoFaceBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoRemoveWatermarkMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoResizeAspectRatioMapper": [[9, 4, 1, "", "STRATEGY"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoResizeResolutionMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoSplitByDurationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"], [9, 2, 1, "", "split_videos_by_duration"]], "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "get_split_key_frame"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoSplitBySceneMapper": [[9, 2, 1, "", "__init__"], [9, 4, 1, "", "avaliable_detectors"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoTaggingFromAudioMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoTaggingFromFramesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.WhitespaceNormalizationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.selector": [[10, 1, 1, "", "FrequencySpecifiedFieldSelector"], [10, 1, 1, "", "TopkSpecifiedFieldSelector"]], "data_juicer.ops.selector.FrequencySpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.TopkSpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:method", "3": "py:function", "4": "py:attribute"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"], "3": ["py", "function", "Python function"], "4": ["py", "attribute", "Python attribute"]}, "titleterms": {"data_juic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14], "analysi": 1, "config": 2, "core": 3, "format": 4, "op": [5, 6, 7, 8, 9, 10], "common": 6, "dedupl": 7, "filter": 8, "mapper": 9, "selector": 10, "tool": 11, "util": 12, "welcom": 13, "data": 13, "juicer": 13, "": 13, "document": 13, "tutori": 13, "api": 13, "refer": 13, "indic": 13, "tabl": 13}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx": 58}, "alltitles": {"data_juicer": [[0, "module-data_juicer"], [14, "data-juicer"]], "data_juicer.analysis": [[1, "module-data_juicer.analysis"]], "data_juicer.config": [[2, "module-data_juicer.config"]], "data_juicer.core": [[3, "module-data_juicer.core"]], "data_juicer.format": [[4, "module-data_juicer.format"]], "data_juicer.ops": [[5, "module-data_juicer.ops"]], "data_juicer.ops.common": [[6, "module-data_juicer.ops.common"]], "data_juicer.ops.deduplicator": [[7, "module-data_juicer.ops.deduplicator"]], "data_juicer.ops.filter": [[8, "module-data_juicer.ops.filter"]], "data_juicer.ops.mapper": [[9, "module-data_juicer.ops.mapper"]], "data_juicer.ops.selector": [[10, "module-data_juicer.ops.selector"]], "data_juicer.tools": [[11, "module-data_juicer.tools"]], "data_juicer.utils": [[12, "module-data_juicer.utils"]], "Welcome to data-juicer\u2019s documentation!": [[13, "welcome-to-data-juicer-s-documentation"]], "Tutorial": [[13, "tutorial"]], "API Reference": [[13, null]], "Indices and Tables": [[13, "indices-and-tables"]]}, "indexentries": {"cuda_device_count() (in module data_juicer)": [[0, "data_juicer.cuda_device_count"]], "data_juicer": [[0, "module-data_juicer"]], "is_cuda_available() (in module data_juicer)": [[0, "data_juicer.is_cuda_available"]], "module": [[0, "module-data_juicer"], [1, "module-data_juicer.analysis"], [2, "module-data_juicer.config"], [3, "module-data_juicer.core"], [4, "module-data_juicer.format"], [5, "module-data_juicer.ops"], [6, "module-data_juicer.ops.common"], [7, "module-data_juicer.ops.deduplicator"], [8, "module-data_juicer.ops.filter"], [9, "module-data_juicer.ops.mapper"], [10, "module-data_juicer.ops.selector"], [11, "module-data_juicer.tools"], [12, "module-data_juicer.utils"]], "columnwiseanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.ColumnWiseAnalysis"]], "diversityanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.DiversityAnalysis"]], "overallanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.OverallAnalysis"]], "__init__() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.__init__"]], "__init__() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.__init__"]], "__init__() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.__init__"]], "analyze() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.analyze"]], "analyze() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.analyze"]], "analyze() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.analyze"]], "compute() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.compute"]], "data_juicer.analysis": [[1, "module-data_juicer.analysis"]], "draw_box() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.draw_box"]], "draw_hist() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.draw_hist"]], "refine_single_column() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.refine_single_column"]], "data_juicer.config": [[2, "module-data_juicer.config"]], "export_config() (in module data_juicer.config)": [[2, "data_juicer.config.export_config"]], "init_configs() (in module data_juicer.config)": [[2, "data_juicer.config.init_configs"]], "merge_config() (in module data_juicer.config)": [[2, "data_juicer.config.merge_config"]], "analyzer (class in data_juicer.core)": [[3, "data_juicer.core.Analyzer"]], "executor (class in data_juicer.core)": [[3, "data_juicer.core.Executor"]], "exporter (class in data_juicer.core)": [[3, "data_juicer.core.Exporter"]], "gib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.GiB"]], "kib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.KiB"]], "mib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.MiB"]], "nesteddataset (class in data_juicer.core)": [[3, "data_juicer.core.NestedDataset"]], "tib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.TiB"]], "tracer (class in data_juicer.core)": [[3, "data_juicer.core.Tracer"]], "__init__() (data_juicer.core.analyzer method)": [[3, "data_juicer.core.Analyzer.__init__"]], "__init__() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.__init__"]], "__init__() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.__init__"]], "__init__() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.__init__"]], "__init__() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.__init__"]], "add_column() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.add_column"]], "cleanup_cache_files() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.cleanup_cache_files"]], "data_juicer.core": [[3, "module-data_juicer.core"]], "export() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.export"]], "export_compute_stats() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.export_compute_stats"]], "filter() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.filter"]], "from_dict() (data_juicer.core.nesteddataset class method)": [[3, "data_juicer.core.NestedDataset.from_dict"]], "map() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.map"]], "process() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.process"]], "remove_columns() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.remove_columns"]], "run() (data_juicer.core.analyzer method)": [[3, "data_juicer.core.Analyzer.run"]], "run() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.run"]], "sample_data() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.sample_data"]], "select() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.select"]], "select_columns() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.select_columns"]], "to_json() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_json"]], "to_jsonl() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_jsonl"]], "to_parquet() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_parquet"]], "trace_batch_mapper() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_batch_mapper"]], "trace_deduplicator() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_deduplicator"]], "trace_filter() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_filter"]], "trace_mapper() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_mapper"]], "csvformatter (class in data_juicer.format)": [[4, "data_juicer.format.CsvFormatter"]], "jsonformatter (class in data_juicer.format)": [[4, "data_juicer.format.JsonFormatter"]], "localformatter (class in data_juicer.format)": [[4, "data_juicer.format.LocalFormatter"]], "mixtureformatter (class in data_juicer.format)": [[4, "data_juicer.format.MixtureFormatter"]], "parquetformatter (class in data_juicer.format)": [[4, "data_juicer.format.ParquetFormatter"]], "remoteformatter (class in data_juicer.format)": [[4, "data_juicer.format.RemoteFormatter"]], "suffixes (data_juicer.format.csvformatter attribute)": [[4, "data_juicer.format.CsvFormatter.SUFFIXES"]], "suffixes (data_juicer.format.jsonformatter attribute)": [[4, "data_juicer.format.JsonFormatter.SUFFIXES"]], "suffixes (data_juicer.format.parquetformatter attribute)": [[4, "data_juicer.format.ParquetFormatter.SUFFIXES"]], "suffixes (data_juicer.format.textformatter attribute)": [[4, "data_juicer.format.TextFormatter.SUFFIXES"]], "suffixes (data_juicer.format.tsvformatter attribute)": [[4, "data_juicer.format.TsvFormatter.SUFFIXES"]], "textformatter (class in data_juicer.format)": [[4, "data_juicer.format.TextFormatter"]], "tsvformatter (class in data_juicer.format)": [[4, "data_juicer.format.TsvFormatter"]], "__init__() (data_juicer.format.csvformatter method)": [[4, "data_juicer.format.CsvFormatter.__init__"]], "__init__() (data_juicer.format.jsonformatter method)": [[4, "data_juicer.format.JsonFormatter.__init__"]], "__init__() (data_juicer.format.localformatter method)": [[4, "data_juicer.format.LocalFormatter.__init__"]], "__init__() (data_juicer.format.mixtureformatter method)": [[4, "data_juicer.format.MixtureFormatter.__init__"]], "__init__() (data_juicer.format.parquetformatter method)": [[4, "data_juicer.format.ParquetFormatter.__init__"]], "__init__() (data_juicer.format.remoteformatter method)": [[4, "data_juicer.format.RemoteFormatter.__init__"]], "__init__() (data_juicer.format.textformatter method)": [[4, "data_juicer.format.TextFormatter.__init__"]], "__init__() (data_juicer.format.tsvformatter method)": [[4, "data_juicer.format.TsvFormatter.__init__"]], "data_juicer.format": [[4, "module-data_juicer.format"]], "load_dataset() (data_juicer.format.localformatter method)": [[4, "data_juicer.format.LocalFormatter.load_dataset"]], "load_dataset() (data_juicer.format.mixtureformatter method)": [[4, "data_juicer.format.MixtureFormatter.load_dataset"]], "load_dataset() (data_juicer.format.remoteformatter method)": [[4, "data_juicer.format.RemoteFormatter.load_dataset"]], "load_dataset() (data_juicer.format.textformatter method)": [[4, "data_juicer.format.TextFormatter.load_dataset"]], "load_formatter() (in module data_juicer.format)": [[4, "data_juicer.format.load_formatter"]], "random_sample() (data_juicer.format.mixtureformatter class method)": [[4, "data_juicer.format.MixtureFormatter.random_sample"]], "deduplicator (class in data_juicer.ops)": [[5, "data_juicer.ops.Deduplicator"]], "filter (class in data_juicer.ops)": [[5, "data_juicer.ops.Filter"]], "mapper (class in data_juicer.ops)": [[5, "data_juicer.ops.Mapper"]], "selector (class in data_juicer.ops)": [[5, "data_juicer.ops.Selector"]], "__init__() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.__init__"]], "__init__() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.__init__"]], "__init__() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.__init__"]], "__init__() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.__init__"]], "compute_hash() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.compute_hash"]], "compute_stats() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.compute_stats"]], "data_juicer.ops": [[5, "module-data_juicer.ops"]], "load_ops() (in module data_juicer.ops)": [[5, "data_juicer.ops.load_ops"]], "process() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.process"]], "process() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.process"]], "process() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.process"]], "process() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.process"]], "run() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.run"]], "run() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.run"]], "run() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.run"]], "run() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.run"]], "data_juicer.ops.common": [[6, "module-data_juicer.ops.common"]], "get_sentences_from_document() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.get_sentences_from_document"]], "get_words_from_document() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.get_words_from_document"]], "merge_on_whitespace_tab_newline() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.merge_on_whitespace_tab_newline"]], "split_on_newline_tab_whitespace() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.split_on_newline_tab_whitespace"]], "split_on_whitespace() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.split_on_whitespace"]], "strip() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.strip"]], "words_augmentation() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.words_augmentation"]], "words_refinement() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.words_refinement"]], "documentdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator"]], "documentminhashdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator"]], "documentsimhashdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator"]], "empty_hash_value (data_juicer.ops.deduplicator.raybasicdeduplicator attribute)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.EMPTY_HASH_VALUE"]], "imagededuplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator"]], "raybasicdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator"]], "raydocumentdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator"]], "rayimagededuplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator"]], "rayvideodeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator"]], "videodeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator"]], "__init__() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.raydocumentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.rayimagededuplicator method)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.rayvideodeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.__init__"]], "calculate_hash() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.raydocumentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.rayimagededuplicator method)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.rayvideodeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator.calculate_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.compute_hash"]], "compute_stats() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.compute_stats"]], "data_juicer.ops.deduplicator": [[7, "module-data_juicer.ops.deduplicator"]], "process() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.process"]], "alphanumericfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AlphanumericFilter"]], "audiodurationfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioDurationFilter"]], "audionmfsnrfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter"]], "audiosizefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioSizeFilter"]], "averagelinelengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter"]], "characterrepetitionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter"]], "flaggedwordfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.FlaggedWordFilter"]], "imageaestheticsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter"]], "imageaspectratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter"]], "imagefaceratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter"]], "imagensfwfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageNSFWFilter"]], "imageshapefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageShapeFilter"]], "imagesizefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageSizeFilter"]], "imagetextmatchingfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter"]], "imagetextsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter"]], "imagewatermarkfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter"]], "languageidscorefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter"]], "maximumlinelengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter"]], "perplexityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.PerplexityFilter"]], "phrasegroundingrecallfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter"]], "specialcharactersfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter"]], "specifiedfieldfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter"]], "specifiednumericfieldfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter"]], "stopwordsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.StopWordsFilter"]], "suffixfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SuffixFilter"]], "textactionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextActionFilter"]], "textentitydependencyfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter"]], "textlengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextLengthFilter"]], "tokennumfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TokenNumFilter"]], "videoaestheticsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter"]], "videoaspectratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter"]], "videodurationfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoDurationFilter"]], "videoframestextsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter"]], "videomotionscorefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter"]], "videonsfwfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoNSFWFilter"]], "videoocrarearatiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter"]], "videoresolutionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoResolutionFilter"]], "videotaggingfromframesfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter"]], "videowatermarkfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter"]], "wordrepetitionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.WordRepetitionFilter"]], "wordsnumfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.WordsNumFilter"]], "__init__() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.__init__"]], "__init__() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.__init__"]], "__init__() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.__init__"]], "__init__() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.__init__"]], "__init__() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.__init__"]], "__init__() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.__init__"]], "__init__() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.__init__"]], "__init__() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.__init__"]], "__init__() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.__init__"]], "__init__() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.__init__"]], "__init__() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.__init__"]], "__init__() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.__init__"]], "__init__() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.__init__"]], "__init__() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.__init__"]], "__init__() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.__init__"]], "__init__() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.__init__"]], "__init__() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.__init__"]], "__init__() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.__init__"]], "__init__() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.__init__"]], "__init__() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.__init__"]], "__init__() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.__init__"]], "__init__() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.__init__"]], "compute_stats() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.compute_stats"]], "data_juicer.ops.filter": [[8, "module-data_juicer.ops.filter"]], "get_reader() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.get_reader"]], "process() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.process"]], "process() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.process"]], "process() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.process"]], "process() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.process"]], "process() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.process"]], "process() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.process"]], "process() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.process"]], "process() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.process"]], "process() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.process"]], "process() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.process"]], "process() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.process"]], "process() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.process"]], "process() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.process"]], "process() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.process"]], "process() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.process"]], "process() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.process"]], "process() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.process"]], "process() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.process"]], "process() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.process"]], "process() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.process"]], "process() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.process"]], "process() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.process"]], "process() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.process"]], "process() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.process"]], "process() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.process"]], "process() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.process"]], "process() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.process"]], "process() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.process"]], "process() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.process"]], "process() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.process"]], "process() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.process"]], "process() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.process"]], "process() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.process"]], "process() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.process"]], "process() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.process"]], "process() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.process"]], "process() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.process"]], "process() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.process"]], "process() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.process"]], "process() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.process"]], "process() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.process"]], "audioffmpegwrappedmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper"]], "chineseconvertmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper"]], "cleancopyrightmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper"]], "cleanemailmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanEmailMapper"]], "cleanhtmlmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper"]], "cleanipmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanIpMapper"]], "cleanlinksmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanLinksMapper"]], "expandmacromapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper"]], "extractqamapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExtractQAMapper"]], "fixunicodemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper"]], "imageblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageBlurMapper"]], "imagecaptioningfromgpt4vmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper"]], "imagecaptioningmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper"]], "imagediffusionmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper"]], "imagefaceblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper"]], "nlpaugenmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper"]], "nlpcdazhmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper"]], "punctuationnormalizationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper"]], "removebibliographymapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper"]], "removecommentsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper"]], "removeheadermapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper"]], "removelongwordsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper"]], "removenonchinesecharacterlmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper"]], "removerepeatsentencesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper"]], "removespecificcharsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper"]], "removetabletextmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper"]], "removewordswithincorrectsubstringsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper"]], "replacecontentmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper"]], "strategy (data_juicer.ops.mapper.videoresizeaspectratiomapper attribute)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.STRATEGY"]], "sentencesplitmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper"]], "videocaptioningfromaudiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper"]], "videocaptioningfromframesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper"]], "videocaptioningfromsummarizermapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper"]], "videocaptioningfromvideomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper"]], "videoffmpegwrappedmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper"]], "videofaceblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper"]], "videoremovewatermarkmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper"]], "videoresizeaspectratiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper"]], "videoresizeresolutionmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper"]], "videosplitbydurationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper"]], "videosplitbykeyframemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper"]], "videosplitbyscenemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper"]], "videotaggingfromaudiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper"]], "videotaggingfromframesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper"]], "whitespacenormalizationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper"]], "__init__() (data_juicer.ops.mapper.audioffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper.__init__"]], "__init__() (data_juicer.ops.mapper.chineseconvertmapper method)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleancopyrightmapper method)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanemailmapper method)": [[9, "data_juicer.ops.mapper.CleanEmailMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanhtmlmapper method)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanipmapper method)": [[9, "data_juicer.ops.mapper.CleanIpMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanlinksmapper method)": [[9, "data_juicer.ops.mapper.CleanLinksMapper.__init__"]], "__init__() (data_juicer.ops.mapper.expandmacromapper method)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper.__init__"]], "__init__() (data_juicer.ops.mapper.extractqamapper method)": [[9, "data_juicer.ops.mapper.ExtractQAMapper.__init__"]], "__init__() (data_juicer.ops.mapper.fixunicodemapper method)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imageblurmapper method)": [[9, "data_juicer.ops.mapper.ImageBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagecaptioningfromgpt4vmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagecaptioningmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagediffusionmapper method)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagefaceblurmapper method)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.nlpaugenmapper method)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper.__init__"]], "__init__() (data_juicer.ops.mapper.nlpcdazhmapper method)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper.__init__"]], "__init__() (data_juicer.ops.mapper.punctuationnormalizationmapper method)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removebibliographymapper method)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removecommentsmapper method)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removeheadermapper method)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removenonchinesecharacterlmapper method)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removerepeatsentencesmapper method)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removespecificcharsmapper method)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removetabletextmapper method)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.replacecontentmapper method)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper.__init__"]], "__init__() (data_juicer.ops.mapper.sentencesplitmapper method)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromsummarizermapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromvideomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videofaceblurmapper method)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoremovewatermarkmapper method)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoresizeaspectratiomapper method)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoresizeresolutionmapper method)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbyscenemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videotaggingfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videotaggingfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.whitespacenormalizationmapper method)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper.__init__"]], "avaliable_detectors (data_juicer.ops.mapper.videosplitbyscenemapper attribute)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.avaliable_detectors"]], "data_juicer.ops.mapper": [[9, "module-data_juicer.ops.mapper"]], "get_split_key_frame() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.get_split_key_frame"]], "process() (data_juicer.ops.mapper.audioffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper.process"]], "process() (data_juicer.ops.mapper.chineseconvertmapper method)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper.process"]], "process() (data_juicer.ops.mapper.cleancopyrightmapper method)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper.process"]], "process() (data_juicer.ops.mapper.cleanemailmapper method)": [[9, "data_juicer.ops.mapper.CleanEmailMapper.process"]], "process() (data_juicer.ops.mapper.cleanhtmlmapper method)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper.process"]], "process() (data_juicer.ops.mapper.cleanipmapper method)": [[9, "data_juicer.ops.mapper.CleanIpMapper.process"]], "process() (data_juicer.ops.mapper.cleanlinksmapper method)": [[9, "data_juicer.ops.mapper.CleanLinksMapper.process"]], "process() (data_juicer.ops.mapper.expandmacromapper method)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper.process"]], "process() (data_juicer.ops.mapper.extractqamapper method)": [[9, "data_juicer.ops.mapper.ExtractQAMapper.process"]], "process() (data_juicer.ops.mapper.fixunicodemapper method)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper.process"]], "process() (data_juicer.ops.mapper.imageblurmapper method)": [[9, "data_juicer.ops.mapper.ImageBlurMapper.process"]], "process() (data_juicer.ops.mapper.imagecaptioningfromgpt4vmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper.process"]], "process() (data_juicer.ops.mapper.imagecaptioningmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper.process"]], "process() (data_juicer.ops.mapper.imagediffusionmapper method)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper.process"]], "process() (data_juicer.ops.mapper.imagefaceblurmapper method)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper.process"]], "process() (data_juicer.ops.mapper.nlpaugenmapper method)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper.process"]], "process() (data_juicer.ops.mapper.nlpcdazhmapper method)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper.process"]], "process() (data_juicer.ops.mapper.punctuationnormalizationmapper method)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper.process"]], "process() (data_juicer.ops.mapper.removebibliographymapper method)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper.process"]], "process() (data_juicer.ops.mapper.removecommentsmapper method)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper.process"]], "process() (data_juicer.ops.mapper.removeheadermapper method)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper.process"]], "process() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.process"]], "process() (data_juicer.ops.mapper.removenonchinesecharacterlmapper method)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper.process"]], "process() (data_juicer.ops.mapper.removerepeatsentencesmapper method)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper.process"]], "process() (data_juicer.ops.mapper.removespecificcharsmapper method)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper.process"]], "process() (data_juicer.ops.mapper.removetabletextmapper method)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper.process"]], "process() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.process"]], "process() (data_juicer.ops.mapper.replacecontentmapper method)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper.process"]], "process() (data_juicer.ops.mapper.sentencesplitmapper method)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper.process"]], "process() (data_juicer.ops.mapper.videocaptioningfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper.process"]], "process() (data_juicer.ops.mapper.videocaptioningfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper.process"]], "process() (data_juicer.ops.mapper.videocaptioningfromsummarizermapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper.process"]], "process() (data_juicer.ops.mapper.videocaptioningfromvideomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper.process"]], "process() (data_juicer.ops.mapper.videoffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper.process"]], "process() (data_juicer.ops.mapper.videofaceblurmapper method)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper.process"]], "process() (data_juicer.ops.mapper.videoremovewatermarkmapper method)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper.process"]], "process() (data_juicer.ops.mapper.videoresizeaspectratiomapper method)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.process"]], "process() (data_juicer.ops.mapper.videoresizeresolutionmapper method)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper.process"]], "process() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.process"]], "process() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.process"]], "process() (data_juicer.ops.mapper.videosplitbyscenemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.process"]], "process() (data_juicer.ops.mapper.videotaggingfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper.process"]], "process() (data_juicer.ops.mapper.videotaggingfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper.process"]], "process() (data_juicer.ops.mapper.whitespacenormalizationmapper method)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper.process"]], "should_keep_long_word() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.should_keep_long_word"]], "should_keep_word_with_incorrect_substrings() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.should_keep_word_with_incorrect_substrings"]], "split_videos_by_duration() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.split_videos_by_duration"]], "frequencyspecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector"]], "topkspecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector"]], "__init__() (data_juicer.ops.selector.frequencyspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector.__init__"]], "__init__() (data_juicer.ops.selector.topkspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector.__init__"]], "data_juicer.ops.selector": [[10, "module-data_juicer.ops.selector"]], "process() (data_juicer.ops.selector.frequencyspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector.process"]], "process() (data_juicer.ops.selector.topkspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector.process"]], "data_juicer.tools": [[11, "module-data_juicer.tools"]], "data_juicer.utils": [[12, "module-data_juicer.utils"]]}})
\ No newline at end of file
+Search.setIndex({"docnames": ["data_juicer", "data_juicer.analysis", "data_juicer.config", "data_juicer.core", "data_juicer.format", "data_juicer.ops", "data_juicer.ops.common", "data_juicer.ops.deduplicator", "data_juicer.ops.filter", "data_juicer.ops.mapper", "data_juicer.ops.selector", "data_juicer.tools", "data_juicer.utils", "index", "modules"], "filenames": ["data_juicer.rst", "data_juicer.analysis.rst", "data_juicer.config.rst", "data_juicer.core.rst", "data_juicer.format.rst", "data_juicer.ops.rst", "data_juicer.ops.common.rst", "data_juicer.ops.deduplicator.rst", "data_juicer.ops.filter.rst", "data_juicer.ops.mapper.rst", "data_juicer.ops.selector.rst", "data_juicer.tools.rst", "data_juicer.utils.rst", "index.rst", "modules.rst"], "titles": ["data_juicer", "data_juicer.analysis", "data_juicer.config", "data_juicer.core", "data_juicer.format", "data_juicer.ops", "data_juicer.ops.common", "data_juicer.ops.deduplicator", "data_juicer.ops.filter", "data_juicer.ops.mapper", "data_juicer.ops.selector", "data_juicer.tools", "data_juicer.utils", "Welcome to data-juicer\u2019s documentation!", "data_juicer"], "terms": {"cuda_device_count": [0, 14], "sourc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "is_cuda_avail": [0, 14], "class": [1, 3, 4, 5, 7, 8, 9, 10], "columnwiseanalysi": [1, 3, 13], "dataset": [1, 3, 4, 5, 7, 8, 9, 10], "output_path": 1, "overall_result": 1, "none": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "save_stats_in_one_fil": 1, "true": [1, 2, 3, 5, 6, 7, 8, 9, 10], "base": [1, 3, 4, 5, 7, 8, 9, 10], "object": [1, 2, 3, 8], "appli": [1, 3, 7, 9, 10], "each": [1, 3, 5, 7, 9], "column": [1, 3, 9], "stat": [1, 3, 5, 7, 8], "respect": [1, 9], "__init__": [1, 3, 4, 5, 7, 8, 9, 10], "initi": [1, 2, 3, 4, 7, 8, 9, 10], "method": [1, 3, 4, 6, 7, 8, 9, 10], "paramet": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "analyz": [1, 2, 3, 13], "path": [1, 2, 3, 4, 7], "store": [1, 3, 4, 5, 7, 8, 9], "result": [1, 3, 8], "option": [1, 3, 4], "precomput": 1, "overal": 1, "whether": [1, 2, 3, 4, 5, 6, 7, 8, 9], "save": [1, 2, 3], "all": [1, 3, 6, 8, 9], "figur": [1, 3, 9], "one": [1, 2, 6, 7, 8, 9], "imag": [1, 5, 7, 8, 9], "file": [1, 2, 3, 4, 5, 8, 9], "show_percentil": 1, "fals": [1, 2, 3, 4, 5, 6, 7, 8, 9], "show": [1, 3, 9], "skip_export": [1, 3], "draw": 1, "percentil": [1, 10], "line": [1, 2, 8, 9], "sub": [1, 6, 7], "If": [1, 3, 7, 8, 9], "": [1, 3, 7, 8, 9], "sever": [1, 9], "red": 1, "indic": [1, 9], "quantil": 1, "distribut": [1, 3, 9], "singl": [1, 3, 9], "window": [1, 7], "after": [1, 3, 6, 7, 8, 9], "disk": [1, 3], "return": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "draw_hist": 1, "ax": 1, "data": [1, 4, 5, 8, 9], "save_path": 1, "histogram": 1, "includ": [1, 7, 8, 9], "inform": [1, 5, 7, 8, 10], "draw_box": 1, "box": [1, 9], "plot": 1, "diversityanalysi": [1, 13], "lang_or_model": 1, "en": [1, 8, 9], "divers": [1, 9], "sampl": [1, 3, 4, 5, 7, 8, 9, 10], "get": [1, 6], "an": [1, 3, 4, 5, 7, 8, 9], "param": [1, 2, 4, 6, 7, 9], "model": [1, 6, 7, 8, 9, 13], "specif": [1, 3, 5, 7, 8, 9], "languag": [1, 7, 8, 9], "us": [1, 2, 3, 4, 5, 6, 7, 8, 9, 13], "load": [1, 3, 4, 5, 9], "comput": [1, 3, 5, 6, 7, 8], "column_nam": 1, "text": [1, 4, 5, 7, 8, 9], "lexic": 1, "tree": [1, 8], "name": [1, 3, 4, 5, 8, 9], "postproc_func": 1, "function": [1, 6, 7], "get_divers": 1, "postproc_kwarg": 1, "whole": [1, 8], "In": [1, 3], "default": [1, 2, 3, 4, 7, 8, 9], "argument": [1, 3, 5, 8, 9], "overallanalysi": [1, 3, 13], "mean": [1, 3, 9], "std": 1, "etc": [1, 3, 4], "refine_single_column": 1, "col": 1, "num_proc": [1, 3, 4], "1": [1, 3, 4, 8, 9], "describ": 1, "panda": 1, "number": [1, 3, 4, 5, 7, 8, 9, 10], "process": [1, 3, 4, 5, 6, 7, 8, 9, 10, 13], "export": [1, 3, 4, 5, 13], "init_config": [2, 13], "arg": [2, 3, 4, 5, 7, 8, 9, 10], "jsonargpars": 2, "parser": 2, "pars": [2, 9], "from": [2, 3, 4, 5, 6, 7, 8, 9, 10], "posix": 2, "style": 2, "command": [2, 4, 9], "yaml": 2, "json": [2, 3, 4, 8], "jsonnet": 2, "superset": 2, "environ": 2, "variabl": [2, 5], "hard": 2, "code": [2, 9], "list": [2, 3, 4, 5, 6, 8, 9], "e": [2, 3, 4, 8, 9], "g": [2, 3, 4, 9], "conifg": 2, "cfg": [2, 3, 4], "defaut": 2, "global": [2, 4, 9], "executor": [2, 3, 13], "export_config": [2, 13], "format": [2, 3, 8, 9, 13], "skip_non": 2, "skip_check": 2, "overwrit": [2, 9], "multifil": 2, "some": [2, 9], "ar": [2, 3, 6, 7, 8, 9, 10], "namespac": 2, "type": [2, 4, 9], "json_ind": 2, "parser_mod": 2, "exclud": 2, "entri": 2, "whose": [2, 8, 9], "valu": [2, 5, 7, 8, 9, 10], "i": [2, 3, 4, 5, 6, 7, 8, 9], "skip": 2, "check": 2, "exist": 2, "multipl": [2, 3, 4, 6, 7, 8], "__path__": 2, "meta": [2, 4], "merge_config": [2, 13], "ori_cfg": 2, "new_cfg": 2, "dict": [2, 3, 9], "merg": [2, 4, 6, 8], "configur": 2, "origin": [2, 3, 8, 9], "expect": [2, 9], "cfg_after_merg": 2, "thi": [3, 5, 6, 7, 8, 9, 10], "It": [3, 7, 8, 9], "filter": [3, 5, 7, 9, 13], "op": [3, 13], "config": [3, 5, 13], "analysi": [3, 13], "gener": [3, 9], "tabl": [3, 9], "help": 3, "user": 3, "understand": 3, "input": [3, 5, 7, 8, 9, 10], "better": [3, 8], "run": [3, 5, 8, 9], "load_data_np": 3, "pipelin": 3, "worker": 3, "when": [3, 4, 5, 7, 8, 9, 10], "nesteddataset": [3, 13], "karg": 3, "djdataset": 3, "enhanc": 3, "huggingfac": [3, 4, 8, 9], "usabl": 3, "effici": 3, "oper": 3, "checkpoint": 3, "tracer": [3, 5, 7, 13], "map": [3, 9], "overrid": 3, "func": 3, "which": [3, 5, 7, 8, 9], "call": 3, "most": [3, 9], "common": [3, 13], "can": [3, 8, 9], "access": 3, "nest": 3, "manner": 3, "select": [3, 4, 5, 8, 10], "classmethod": [3, 4], "from_dict": 3, "from_xx": 3, "constructor": 3, "construct": 3, "add_column": 3, "add": [3, 4], "select_column": 3, "remove_column": 3, "remov": [3, 5, 6, 8, 9], "cleanup_cache_fil": 3, "clear": 3, "raw": 3, "compress": 3, "cach": [3, 8], "unifi": [3, 4], "order": [3, 10], "sample_data": 3, "dataset_to_sampl": 3, "sample_ratio": 3, "float": [3, 8, 9], "0": [3, 4, 5, 7, 8, 9], "sample_algo": 3, "str": [3, 4, 6, 7, 8, 9, 10], "uniform": [3, 8, 9], "kwarg": [3, 4, 5, 7, 8, 9, 10], "subset": [3, 4], "given": [3, 8, 9], "formatt": [3, 4], "link": [3, 9], "The": [3, 4, 5, 8, 9, 10], "ratio": [3, 4, 6, 8, 9, 10], "size": [3, 6, 7, 8, 9], "algorithm": [3, 7, 9], "frequency_specified_field_selector": 3, "topk_specified_field_selector": 3, "A": [3, 5, 7, 9], "export_path": 3, "export_shard_s": 3, "export_in_parallel": 3, "export_d": 3, "keep_stats_in_res_d": 3, "keep_hashes_in_res_d": 3, "export_stat": 3, "kib": 3, "1024": 3, "mib": 3, "1048576": 3, "gib": 3, "1073741824": 3, "tib": 3, "1099511627776": 3, "shard": 3, "content": [3, 9], "keep": [3, 5, 7, 8, 9], "hash": [3, 5, 7], "export_compute_stat": 3, "statu": 3, "static": 3, "to_jsonl": 3, "jsonl": [3, 4], "target": [3, 8, 10], "extra": [3, 4, 7, 8, 9, 10], "to_json": 3, "to_parquet": 3, "parquet": [3, 4], "work_dir": 3, "show_num": [3, 5, 7], "10": [3, 8, 9], "trace": [3, 5, 7], "chang": [3, 9], "befor": [3, 8], "comparison": 3, "work": [3, 8, 9], "directori": [3, 4, 8], "maximum": [3, 8, 9], "trace_mapp": 3, "op_nam": 3, "previous_d": 3, "processed_d": 3, "text_kei": [3, 4, 5], "compar": 3, "mapper": [3, 5, 13], "mainli": 3, "differ": [3, 4, 6, 7, 8, 9], "pair": [3, 5, 7, 9], "due": 3, "modif": 3, "trace_batch_mapp": 3, "batchmapp": 3, "new": [3, 4, 9], "augment": [3, 6, 8, 9], "trace_filt": 3, "trace_dedupl": 3, "dup_pair": 3, "dedupl": [3, 5, 9, 13], "duplic": [3, 5, 7], "extract": [3, 8, 9], "other": [3, 8, 9], "two": [3, 7, 8], "embed": 3, "independ": [3, 8, 9], "obtain": [3, 6], "load_formatt": [4, 13], "dataset_path": 4, "suffix": [4, 8], "add_suffix": 4, "baseformatt": 4, "mixtur": 4, "weight": [4, 7, 9], "accord": [4, 5, 8, 9], "kei": [4, 5, 8, 9, 10], "field": [4, 5, 7, 8, 9, 10], "specifi": [4, 6, 8, 9, 10], "info": [4, 5], "jsonformatt": [4, 13], "localformatt": [4, 13], "zst": 4, "tupl": [4, 8], "local": 4, "packag": 4, "modul": [4, 13], "csv": 4, "load_dataset": 4, "int": [4, 8, 9], "global_cfg": 4, "its": [4, 5, 7, 9], "consequ": 4, "remoteformatt": [4, 13], "repositori": 4, "hub": 4, "textformatt": [4, 13], "txt": [4, 8], "pdf": [4, 8], "cpp": 4, "docx": [4, 8], "md": 4, "tex": [4, 9], "asm": 4, "bat": 4, "cmd": 4, "c": 4, "h": [4, 8, 9], "hpp": 4, "cc": 4, "hh": 4, "cmake": 4, "css": 4, "dockerfil": 4, "f90": 4, "f": 4, "f03": 4, "f08": 4, "f77": 4, "f95": 4, "fpp": 4, "go": 4, "html": [4, 9], "java": 4, "j": 4, "jl": 4, "lua": 4, "markdown": 4, "php": 4, "php3": 4, "php4": 4, "php5": 4, "phpt": 4, "pl": 4, "pm": 4, "pod": 4, "perl": 4, "ps1": 4, "psd1": 4, "psm1": 4, "py": 4, "rb": 4, "r": 4, "sql": 4, "scala": 4, "sh": 4, "bash": 4, "zsh": 4, "t": [4, 6, 7], "tsx": 4, "vb": 4, "makefil": 4, "xml": 4, "rst": 4, "m": [4, 9], "smali": 4, "datas": 4, "unified_format_dataset": 4, "parquetformatt": [4, 13], "csvformatt": [4, 13], "tsvformatt": [4, 13], "tsv": 4, "delimit": 4, "mixtureformatt": [4, 13], "max_sampl": 4, "mix": 4, "randomli": [4, 9], "everi": 4, "them": [4, 7, 8], "datasset": 4, "dir": 4, "w1": 4, "d": 4, "w2": 4, "ds_dir": 4, "w3": 4, "ds_file": 4, "max": [4, 7, 8, 9], "random_sampl": 4, "sample_numb": 4, "seed": 4, "bigger": [4, 9], "than": [4, 6, 7, 8, 9, 10], "we": [4, 7, 8, 9, 13], "instead": [4, 6], "random": [4, 9, 10], "42": 4, "load_op": [5, 13], "process_list": 5, "op_fus": 5, "item": 5, "fuse": 5, "share": 5, "same": 5, "intermedi": [5, 7, 8], "instanc": 5, "image_kei": 5, "audio_kei": 5, "audio": [5, 8, 9], "video_kei": [5, 9], "video": [5, 7, 8, 9], "compute_stat": [5, 7, 8], "context": [5, 7, 8, 9], "metric": [5, 7, 8], "decid": [5, 7, 8], "var": [5, 7, 8], "temporarili": [5, 7, 8], "For": [5, 7, 8, 9], "level": [5, 6, 7, 8, 9, 10], "boolean": [5, 7, 8], "conduct": 5, "edit": 5, "compute_hash": [5, 7], "doc": [5, 7], "open": [5, 7, 9], "selector": [5, 13], "get_sentences_from_docu": [6, 13], "document": [6, 7, 8, 9], "model_func": 6, "sentenc": [6, 9], "need": [6, 8, 9, 10], "split": [6, 9], "splite": 6, "separ": [6, 8, 10], "n": [6, 8, 9], "get_words_from_docu": [6, 13], "token_func": 6, "new_lin": 6, "tab": 6, "word": [6, 8, 9], "like": [6, 7, 8, 9], "stopword": [6, 8], "token": [6, 7, 8, 9], "merge_on_whitespace_tab_newlin": [6, 13], "invert": 6, "split_on_newline_tab_whitespac": [6, 13], "concaten": [6, 9], "first": [6, 7, 8, 9], "split_on_whitespac": [6, 13], "also": 6, "space": [6, 7], "tag": [6, 8, 9], "strip": [6, 13], "strip_charact": 6, "wai": [6, 9], "faster": 6, "sinc": 6, "now": [6, 9], "set": [6, 8, 9, 10], "contain": [6, 8, 9], "lot": 6, "element": 6, "emoji": 6, "charact": [6, 7, 8, 9], "words_augment": [6, 13], "group_siz": 6, "join_char": 6, "especi": [6, 8], "chines": [6, 7, 8, 9], "without": [6, 9], "between": [6, 7, 8, 9], "vietnames": [6, 8], "syllabl": 6, "group": [6, 8], "ad": [6, 9], "words_refin": [6, 13], "lower_cas": 6, "strip_char": 6, "use_words_aug": [6, 8], "words_aug_group_s": [6, 8], "2": [6, 8, 9], "words_aug_join_char": [6, 8], "refin": 6, "non": [6, 7, 9], "revers": [6, 10], "special": [6, 8, 9], "convert": [6, 7, 9], "lower": [6, 7, 8, 9, 10], "case": [6, 7, 8, 9, 13], "lowercas": [6, 7, 9], "char": [6, 8, 9], "videodedupl": [7, 13], "consider_text": 7, "bool": [7, 8, 9, 10], "exact": 7, "match": [7, 8, 9], "consid": [7, 8, 9], "togeth": [7, 9], "raybasicdedupl": [7, 13], "redis_host": 7, "localhost": 7, "redis_port": 7, "positiveint": [7, 8, 9, 10], "6380": 7, "basic": 7, "rai": 7, "although": 7, "implement": 7, "empty_hash_valu": 7, "empti": [7, 9], "hostnam": 7, "redi": 7, "server": 7, "port": 7, "calculate_hash": 7, "calcul": [7, 8], "documentminhashdedupl": [7, 13], "window_s": 7, "5": [7, 8, 9], "ignore_pattern": 7, "num_permut": 7, "256": 7, "jaccard_threshold": 7, "closedunitinterv": [7, 8, 9, 10], "7": [7, 9], "num_band": 7, "num_rows_per_band": 7, "tokenizer_model": 7, "minhashlsh": 7, "simhash": 7, "minhash": 7, "byte": [7, 8], "so": [7, 8, 9], "thei": 7, "won": 7, "kept": [7, 8], "final": [7, 9], "should": [7, 8, 9], "punctuat": [7, 9], "sentencepiec": 7, "english": [7, 8, 9], "recommend": [7, 9], "pleas": 7, "provid": [7, 9], "shingl": 7, "ignor": [7, 9], "string": [7, 8, 9], "pattern": [7, 9], "permut": 7, "min": [7, 8, 9], "jaccard": 7, "similar": [7, 8, 9], "threshold": [7, 8, 9], "detect": [7, 8, 9], "regard": 7, "onli": [7, 8, 9], "band": 7, "lsh": 7, "determin": [7, 10], "optim": 7, "minim": 7, "sum": 7, "prob": 7, "posit": [7, 8, 9], "neg": [7, 9], "row": 7, "rayimagededupl": [7, 13], "phash": 7, "raydocumentdedupl": [7, 13], "ignore_non_charact": 7, "alphabet": [7, 8, 9], "whitespac": [7, 9], "digit": 7, "documentdedupl": [7, 13], "md5": 7, "imagededupl": [7, 13], "documentsimhashdedupl": [7, 13], "6": [7, 8], "num_block": 7, "hamming_dist": 7, "4": [7, 8, 9], "And": 7, "block": 7, "ham": 7, "distanc": 7, "alwai": 7, "less": [7, 8, 9, 10], "rayvideodedupl": [7, 13], "imagetextsimilarityfilt": [8, 13], "hf_clip": 8, "openai": 8, "clip": [8, 9], "vit": 8, "patch32": 8, "min_scor": 8, "max_scor": 8, "horizontal_flip": [8, 9], "vertical_flip": [8, 9], "any_or_al": [8, 9], "ani": [8, 9], "reduce_mod": 8, "avg": 8, "those": 8, "within": [8, 9, 10], "rang": [8, 9, 10], "flip": [8, 9], "horizont": [8, 9], "left": [8, 9], "right": [8, 9], "vertic": [8, 9], "top": [8, 9, 10], "bottom": [8, 9], "strategi": [8, 9], "meet": [8, 9], "condit": [8, 9], "reduc": [8, 9], "mode": [8, 9], "correspond": [8, 10], "chunk": 8, "take": 8, "averag": 8, "rank": [8, 9, 10], "videoaspectratiofilt": [8, 13], "min_ratio": [8, 9], "9": [8, 9], "21": [8, 9], "max_ratio": [8, 9], "aspect": [8, 9], "aspectratio": [8, 9], "w": [8, 9], "minimum": [8, 9], "support": [8, 9], "imagetextmatchingfilt": [8, 13], "hf_blip": 8, "salesforc": [8, 9], "blip": [8, 9], "itm": 8, "coco": 8, "003": 8, "score": 8, "imagensfwfilt": [8, 13], "hf_nsfw_model": 8, "falconsai": 8, "nsfw_image_detect": 8, "score_threshold": 8, "have": 8, "low": 8, "nsfw": 8, "tokennumfilt": [8, 13], "hf_token": 8, "eleutherai": 8, "pythia": 8, "9b": 8, "dedup": 8, "min_num": 8, "max_num": 8, "9223372036854775807": [8, 9], "total": [8, 9], "hug": 8, "face": [8, 9], "below": [8, 9], "exce": [8, 9], "textlengthfilt": [8, 13], "min_len": [8, 9], "max_len": [8, 9], "length": [8, 9], "specifiednumericfieldfilt": [8, 13], "field_kei": [8, 10], "min_valu": 8, "max_valu": 8, "numer": 8, "multi": [8, 10, 13], "specifiednumericfield": 8, "audionmfsnrfilt": [8, 13], "min_snr": 8, "max_snr": 8, "nmf_iter_num": 8, "500": [8, 9], "snr": 8, "nmf": 8, "db": 8, "sy": 8, "maxsiz": 8, "iter": [8, 9], "videoaestheticsfilt": [8, 13], "hf_scorer_model": 8, "frame_sampling_method": [8, 9], "frame_num": [8, 9], "3": [8, 9], "aesthet": 8, "frame": [8, 9], "predictor": 8, "By": 8, "shunk031": 8, "v2": 8, "sac": 8, "logo": 8, "ava1": 8, "l14": 8, "linearms": 8, "refer": [8, 9], "pypi": 8, "org": [8, 9], "project": 8, "simpl": [8, 9], "predict": 8, "all_keyfram": [8, 9], "former": [8, 9], "latter": [8, 9], "uniformli": [8, 9], "keyfram": 8, "larg": 8, "while": 8, "usual": 8, "small": 8, "term": 8, "middl": [8, 9], "last": [8, 9], "larger": [8, 9, 10], "addit": [8, 9], "durat": [8, 9], "must": [8, 9], "keyword": [8, 9], "perplexityfilt": [8, 13], "lang": [8, 9], "max_ppl": 8, "positivefloat": 8, "1500": 8, "perplex": 8, "phrasegroundingrecallfilt": [8, 13], "hf_owlvit": 8, "googl": 8, "owlvit": 8, "min_recal": 8, "max_recal": 8, "iou_thr": 8, "large_area_ratio_thr": 8, "95": 8, "conf_thr": 8, "locat": [8, 9], "recal": 8, "phrase": 8, "owl": 8, "ground": 8, "iou": 8, "nm": 8, "post": 8, "bbox": 8, "overlap": 8, "confid": 8, "area": 8, "out": 8, "account": 8, "more": [8, 9, 13], "maximumlinelengthfilt": [8, 13], "averagelinelengthfilt": [8, 13], "specifiedfieldfilt": [8, 13], "target_valu": 8, "retain": [8, 9], "videotaggingfromframesfilt": [8, 13], "peopl": 8, "shift": 8, "found": [8, 9], "http": [8, 9], "github": 8, "com": 8, "xinyu1205": 8, "recogn": 8, "anyth": 8, "blob": 8, "main": [8, 9], "ram": 8, "ram_tag_list": 8, "noqa": 8, "e501": 8, "requir": 8, "equal": [8, 9, 10], "depend": [8, 9], "textentitydependencyfilt": [8, 13], "min_dependency_num": 8, "identifi": [8, 9], "entiti": 8, "omit": 8, "zh": 8, "mini_dependency_num": 8, "edg": 8, "objet": 8, "videoresolutionfilt": [8, 13], "min_width": [8, 9], "max_width": [8, 9], "min_height": [8, 9], "max_height": [8, 9], "resolut": [8, 9], "alphanumericfilt": [8, 13], "25": 8, "count": 8, "alphanumer": 8, "imagewatermarkfilt": [8, 13], "hf_watermark_model": 8, "amrul": 8, "hzz": 8, "watermark_detector": 8, "prob_threshold": 8, "8": [8, 9], "watermark": [8, 9], "high": 8, "probabl": [8, 9], "imageaestheticsfilt": [8, 13], "audiosizefilt": [8, 13], "min_siz": 8, "max_siz": 8, "1tb": 8, "kb": 8, "mb": 8, "constraint": 8, "approxim": 8, "un": 8, "limit": 8, "stopwordsfilt": [8, 13], "stopwords_dir": 8, "home": 8, "runner": 8, "asset": 8, "what": 8, "adopt": 8, "avail": 8, "join": 8, "characterrepetitionfilt": [8, 13], "rep_len": 8, "gram": 8, "repetit": 8, "imageshapefilt": [8, 13], "shape": 8, "width": [8, 9], "height": [8, 9], "videodurationfilt": [8, 13], "min_dur": 8, "nonnegativefloat": [8, 9], "max_dur": 8, "second": [8, 9], "textactionfilt": [8, 13], "min_action_num": 8, "action": 8, "mini_action_num": 8, "videoocrarearatiofilt": [8, 13], "min_area_ratio": 8, "max_area_ratio": 8, "frame_sample_num": 8, "languages_to_detect": 8, "ch_sim": 8, "ocr": [8, 9], "evenli": 8, "full": [8, 9], "here": [8, 9, 13], "www": 8, "jaid": 8, "ai": [8, 9], "easyocr": 8, "get_read": 8, "videonsfwfilt": [8, 13], "specialcharactersfilt": [8, 13], "videoframestextsimilarityfilt": [8, 13], "kind": [8, 9], "relat": 8, "exampl": 8, "chineseclip": 8, "might": [8, 9], "choic": 8, "imageaspectratiofilt": [8, 13], "333": 8, "audiodurationfilt": [8, 13], "nonnegativeint": [8, 9], "languageidscorefilt": [8, 13], "identif": 8, "suffixfilt": [8, 13], "imagesizefilt": [8, 13], "videowatermarkfilt": [8, 13], "wordsnumfilt": [8, 13], "imagefaceratiofilt": [8, 13], "largest": [8, 10], "flaggedwordfilt": [8, 13], "045": 8, "flagged_words_dir": 8, "flag": 8, "flagged_word": 8, "wordrepetitionfilt": [8, 13], "videomotionscorefilt": [8, 13], "7976931348623157e": 8, "308": 8, "sampling_fp": 8, "sequenc": [8, 9], "rel": 8, "motion": 8, "farneback": 8, "algorith": 8, "opencv": 8, "dens": 8, "optic": 8, "flow": 8, "rate": 8, "frames_per_second": 8, "resiz": [8, 9], "smaller": [8, 9, 10], "rescal": 8, "allow": [8, 9], "longer": 8, "greater": [8, 9, 10], "being": [8, 9], "overrul": 8, "As": 8, "mai": 8, "shorter": [8, 9], "magnitud": 8, "normal": [8, 9], "diagon": 8, "videocaptioningfromaudiomapp": [9, 13], "keep_original_sampl": 9, "caption": 9, "stream": 9, "qwen": 9, "videotaggingfromaudiomapp": [9, 13], "hf_ast": 9, "mit": 9, "ast": 9, "finetun": 9, "audioset": 9, "4593": 9, "spectrogram": 9, "transform": 9, "imagecaptioningfromgpt4vmapp": [9, 13], "descript": 9, "api_kei": 9, "max_token": 9, "temperatur": 9, "system_prompt": 9, "user_prompt": 9, "user_prompt_kei": 9, "gpt": 9, "visison": 9, "reson": 9, "convers": 9, "custom": 9, "api": 9, "authent": 9, "request": 9, "control": 9, "output": 9, "prompt": 9, "guidanc": [9, 13], "rule": [9, 10], "gpt4": 9, "vision": 9, "respons": 9, "guid": 9, "uers_prompt_kei": 9, "punctuationnormalizationmapp": [9, 13], "unicod": 9, "removebibliographymapp": [9, 13], "bibliographi": 9, "end": 9, "latex": 9, "sentencesplitmapp": [9, 13], "videosplitbyscenemapp": [9, 13], "detector": 9, "contentdetector": 9, "27": 9, "min_scene_len": 9, "15": 9, "show_progress": 9, "cut": 9, "scene": 9, "avaliable_detector": 9, "adaptivedetector": 9, "window_width": 9, "min_content_v": 9, "luma_onli": 9, "kernel_s": 9, "video_manag": 9, "min_delta_hsv": 9, "thresholddetector": 9, "fade_bia": 9, "add_final_scen": 9, "block_siz": 9, "scenedetect": 9, "pass": 9, "progress": 9, "cleanipmapp": [9, 13], "repl": 9, "clean": 9, "ipv4": 9, "ipv6": 9, "address": 9, "regular": 9, "express": 9, "search": [9, 13], "replac": 9, "cleanlinksmapp": [9, 13], "ftp": 9, "removeheadermapp": [9, 13], "drop_no_head": 9, "header": 9, "begin": 9, "drop": 9, "removetabletextmapp": [9, 13], "min_col": 9, "from_2_to_20": 9, "max_col": 9, "20": 9, "videoremovewatermarkmapp": [9, 13], "roi_str": 9, "roi_typ": 9, "roi_kei": 9, "min_frame_threshold": 9, "detection_method": 9, "pixel_valu": 9, "region": 9, "x1": 9, "y1": 9, "x2": 9, "y2": 9, "roi": 9, "pixel": 9, "corner": 9, "coordin": 9, "wight": 9, "coodin": 9, "pixel_divers": 9, "useless": 9, "removerepeatsentencesmapp": [9, 13], "ignore_special_charact": 9, "min_repeat_sentence_length": 9, "repeat": 9, "judg": 9, "except": 9, "letter": 9, "imagediffusionmapp": [9, 13], "hf_diffus": 9, "compvi": 9, "stabl": 9, "diffus": 9, "v1": 9, "torch_dtyp": 9, "fp32": 9, "revis": 9, "strength": 9, "guidance_scal": 9, "aug_num": 9, "caption_kei": 9, "hf_img2seq": 9, "blip2": 9, "opt": 9, "7b": 9, "point": 9, "fp16": 9, "bf16": 9, "version": 9, "branch": 9, "commit": 9, "id": 9, "git": 9, "extent": 9, "start": 9, "nois": 9, "higher": 9, "denois": 9, "step": 9, "amount": 9, "num_inference_step": 9, "essenti": 9, "scale": 9, "encourag": 9, "close": 9, "expens": 9, "qualiti": 9, "enabl": 9, "produc": 9, "keep_candidate_mod": 9, "caption_num": 9, "candid": 9, "random_ani": 9, "similar_one_simhash": 9, "batched_op": 9, "both": [9, 10], "suppos": 9, "batch": 9, "b": 9, "denot": 9, "2nb": 9, "nb": 9, "mnb": 9, "otherwis": 9, "imagefaceblurmapp": [9, 13], "blur_typ": 9, "gaussian": 9, "radiu": 9, "blur": 9, "kernel": 9, "videoffmpegwrappedmapp": [9, 13], "filter_nam": 9, "filter_kwarg": 9, "global_arg": 9, "capture_stderr": 9, "overwrite_output": 9, "wrapper": 9, "ffmpeg": 9, "captur": 9, "stderr": 9, "chineseconvertmapp": [9, 13], "s2t": 9, "tradit": 9, "simplifi": 9, "japanes": 9, "kanji": 9, "choos": 9, "t2": 9, "s2tw": 9, "taiwan": 9, "standard": 9, "tw2": 9, "s2hk": 9, "hong": 9, "kong": 9, "variant": 9, "hk2": 9, "s2twp": 9, "taiwanes": 9, "idiom": 9, "tw2sp": 9, "mainland": 9, "t2tw": 9, "tw2t": 9, "hk2t": 9, "t2hk": 9, "t2jp": 9, "ky\u016bjitai": 9, "jp2t": 9, "shinjitai": 9, "nlpcdazhmapp": [9, 13], "sequenti": 9, "replace_similar_word": 9, "replace_homophone_char": 9, "delete_random_char": 9, "swap_random_char": 9, "replace_equivalent_num": 9, "simpli": 9, "nlpcda": 9, "librari": 9, "you": 9, "time": 9, "semant": 9, "significantli": 9, "notic": 9, "combin": 9, "would": 9, "opened_aug_method": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "\u8fd9\u8fb9\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "homophon": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6fd6\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "delet": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a": 9, "swap": 9, "contigu": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u5f3a\u589e\u65b9\u6cd5": 9, "equival": 9, "represent": 9, "\u8fd9\u91cc\u4e00\u5171\u6709\u4f0d\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "imageblurmapp": [9, 13], "p": 9, "blure": 9, "cleancopyrightmapp": [9, 13], "copyright": 9, "comment": 9, "removenonchinesecharacterlmapp": [9, 13], "keep_alphabet": 9, "keep_numb": 9, "keep_punc": 9, "videosplitbykeyframemapp": [9, 13], "get_split_key_fram": 9, "removespecificcharsmapp": [9, 13], "chars_to_remov": 9, "videoresizeaspectratiomapp": [9, 13], "increas": 9, "decreas": 9, "enforc": 9, "abov": 9, "adjust": 9, "dimens": 9, "either": 9, "enlarg": 9, "accept": 9, "cleanhtmlmapp": [9, 13], "whitespacenormalizationmapp": [9, 13], "0x20": 9, "wikipedia": 9, "wiki": 9, "whitespace_charact": 9, "videotaggingfromframesmapp": [9, 13], "removecommentsmapp": [9, 13], "doc_typ": 9, "inlin": 9, "multilin": 9, "expandmacromapp": [9, 13], "expand": 9, "macro": 9, "definit": 9, "bodi": 9, "extractqamapp": [9, 13], "hf_model": 9, "alibaba": 9, "pai": 9, "qwen1_5": 9, "doc2qa": 9, "qa_format": 9, "chatml": 9, "question": 9, "answer": 9, "llama3": 9, "8b": 9, "baichuan2": 9, "4b": 9, "1b8": 9, "0b5": 9, "These": 9, "train": 9, "suitabl": 9, "hugginfac": 9, "interfac": 9, "follow": 9, "\u8499\u53e4\u56fd\u7684\u9996\u90fd\u662f\u4e4c\u5170\u5df4\u6258": 9, "ulaanbaatar": 9, "\u51b0\u5c9b\u7684\u9996\u90fd\u662f\u96f7\u514b\u96c5\u672a\u514b": 9, "reykjavik": 9, "human": 9, "\u8bf7\u95ee\u8499\u53e4\u56fd\u7684\u9996\u90fd\u662f\u54ea\u91cc": 9, "assist": 9, "\u4f60\u597d": 9, "\u6839\u636e\u63d0\u4f9b\u7684\u4fe1\u606f": 9, "\u51b0\u5c9b\u7684\u9996\u90fd\u662f\u54ea\u91cc\u5462": 9, "imagecaptioningmapp": [9, 13], "prompt_kei": 9, "anoth": 9, "how": 9, "mani": 9, "similar_on": 9, "removewordswithincorrectsubstringsmapp": [9, 13], "substr": 9, "incorrect": 9, "should_keep_word_with_incorrect_substr": 9, "videocaptioningfromvideomapp": [9, 13], "hf_video_blip": 9, "kpyu": 9, "ego4d": 9, "videocaptioningfromsummarizermapp": [9, 13], "hf_summar": 9, "consider_video_caption_from_video": 9, "consider_video_caption_from_audio": 9, "consider_video_caption_from_fram": 9, "consider_video_tags_from_audio": 9, "consider_video_tags_from_fram": 9, "vid_cap_from_vid_arg": 9, "vid_cap_from_frm_arg": 9, "vid_tag_from_aud_arg": 9, "vid_tag_from_frm_arg": 9, "keep_tag_num": 9, "summar": 9, "directli": 9, "too": 9, "bring": 9, "influenc": 9, "frequent": 9, "fixunicodemapp": [9, 13], "fix": 9, "error": 9, "form": 9, "nfc": 9, "nfkc": 9, "nfd": 9, "nfkd": 9, "nlpaugenmapp": [9, 13], "delete_random_word": 9, "swap_random_word": 9, "spelling_error_word": 9, "split_random_word": 9, "keyboard_error_char": 9, "ocr_error_char": 9, "insert_random_char": 9, "nlpaug": 9, "love": 9, "llm": 9, "simul": 9, "spell": 9, "ll": 9, "keyboard": 9, "ov4": 9, "10ve": 9, "oe": 9, "ovl": 9, "insert": 9, "lkove": 9, "videocaptioningfromframesmapp": [9, 13], "removelongwordsmapp": [9, 13], "long": 9, "should_keep_long_word": 9, "videoresizeresolutionmapp": [9, 13], "force_original_aspect_ratio": 9, "disabl": 9, "force_divisible_bi": 9, "leav": 9, "super": 9, "deep": 9, "learn": 9, "futur": 9, "necessari": 9, "ensur": 9, "divis": 9, "integ": 9, "even": 9, "cleanemailmapp": [9, 13], "email": 9, "replacecontentmapp": [9, 13], "design": 9, "audioffmpegwrappedmapp": [9, 13], "videosplitbydurationmapp": [9, 13], "split_dur": 9, "min_last_split_dur": 9, "discard": 9, "split_videos_by_dur": 9, "videofaceblurmapp": [9, 13], "frequencyspecifiedfieldselector": [10, 13], "top_ratio": 10, "topk": 10, "sort": 10, "frequenc": 10, "descend": 10, "randomselector": [10, 13], "select_ratio": 10, "select_num": 10, "rangespecifiedfieldselector": [10, 13], "lower_percentil": 10, "upper_percentil": 10, "lower_rank": 10, "upper_rank": 10, "smallest": 10, "bound": 10, "upper": 10, "topkspecifiedfieldselector": [10, 13], "give": 13, "kdd": 13, "24": 13, "modal": 13, "foundat": 13, "practic": 13, "see": 13, "detail": 13, "data_juic": 13, "core": 13, "index": 13, "page": 13}, "objects": {"": [[0, 0, 0, "-", "data_juicer"]], "data_juicer": [[1, 0, 0, "-", "analysis"], [2, 0, 0, "-", "config"], [3, 0, 0, "-", "core"], [0, 3, 1, "", "cuda_device_count"], [4, 0, 0, "-", "format"], [0, 3, 1, "", "is_cuda_available"], [5, 0, 0, "-", "ops"], [11, 0, 0, "-", "tools"], [12, 0, 0, "-", "utils"]], "data_juicer.analysis": [[1, 1, 1, "", "ColumnWiseAnalysis"], [1, 1, 1, "", "DiversityAnalysis"], [1, 1, 1, "", "OverallAnalysis"]], "data_juicer.analysis.ColumnWiseAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "draw_box"], [1, 2, 1, "", "draw_hist"]], "data_juicer.analysis.DiversityAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "compute"]], "data_juicer.analysis.OverallAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "refine_single_column"]], "data_juicer.config": [[2, 3, 1, "", "export_config"], [2, 3, 1, "", "init_configs"], [2, 3, 1, "", "merge_config"]], "data_juicer.core": [[3, 1, 1, "", "Analyzer"], [3, 1, 1, "", "Executor"], [3, 1, 1, "", "Exporter"], [3, 1, 1, "", "NestedDataset"], [3, 1, 1, "", "Tracer"]], "data_juicer.core.Analyzer": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "run"]], "data_juicer.core.Executor": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "run"], [3, 2, 1, "", "sample_data"]], "data_juicer.core.Exporter": [[3, 4, 1, "", "GiB"], [3, 4, 1, "", "KiB"], [3, 4, 1, "", "MiB"], [3, 4, 1, "", "TiB"], [3, 2, 1, "", "__init__"], [3, 2, 1, "", "export"], [3, 2, 1, "", "export_compute_stats"], [3, 2, 1, "", "to_json"], [3, 2, 1, "", "to_jsonl"], [3, 2, 1, "", "to_parquet"]], "data_juicer.core.NestedDataset": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "add_column"], [3, 2, 1, "", "cleanup_cache_files"], [3, 2, 1, "", "filter"], [3, 2, 1, "", "from_dict"], [3, 2, 1, "", "map"], [3, 2, 1, "", "process"], [3, 2, 1, "", "remove_columns"], [3, 2, 1, "", "select"], [3, 2, 1, "", "select_columns"]], "data_juicer.core.Tracer": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "trace_batch_mapper"], [3, 2, 1, "", "trace_deduplicator"], [3, 2, 1, "", "trace_filter"], [3, 2, 1, "", "trace_mapper"]], "data_juicer.format": [[4, 1, 1, "", "CsvFormatter"], [4, 1, 1, "", "JsonFormatter"], [4, 1, 1, "", "LocalFormatter"], [4, 1, 1, "", "MixtureFormatter"], [4, 1, 1, "", "ParquetFormatter"], [4, 1, 1, "", "RemoteFormatter"], [4, 1, 1, "", "TextFormatter"], [4, 1, 1, "", "TsvFormatter"], [4, 3, 1, "", "load_formatter"]], "data_juicer.format.CsvFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.JsonFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.LocalFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.MixtureFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"], [4, 2, 1, "", "random_sample"]], "data_juicer.format.ParquetFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.RemoteFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.TextFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.TsvFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.ops": [[5, 1, 1, "", "Deduplicator"], [5, 1, 1, "", "Filter"], [5, 1, 1, "", "Mapper"], [5, 1, 1, "", "Selector"], [6, 0, 0, "-", "common"], [7, 0, 0, "-", "deduplicator"], [8, 0, 0, "-", "filter"], [5, 3, 1, "", "load_ops"], [9, 0, 0, "-", "mapper"], [10, 0, 0, "-", "selector"]], "data_juicer.ops.Deduplicator": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "compute_hash"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.Filter": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "compute_stats"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.Mapper": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.Selector": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.common": [[6, 3, 1, "", "get_sentences_from_document"], [6, 3, 1, "", "get_words_from_document"], [6, 3, 1, "", "merge_on_whitespace_tab_newline"], [6, 3, 1, "", "split_on_newline_tab_whitespace"], [6, 3, 1, "", "split_on_whitespace"], [6, 3, 1, "", "strip"], [6, 3, 1, "", "words_augmentation"], [6, 3, 1, "", "words_refinement"]], "data_juicer.ops.deduplicator": [[7, 1, 1, "", "DocumentDeduplicator"], [7, 1, 1, "", "DocumentMinhashDeduplicator"], [7, 1, 1, "", "DocumentSimhashDeduplicator"], [7, 1, 1, "", "ImageDeduplicator"], [7, 1, 1, "", "RayBasicDeduplicator"], [7, 1, 1, "", "RayDocumentDeduplicator"], [7, 1, 1, "", "RayImageDeduplicator"], [7, 1, 1, "", "RayVideoDeduplicator"], [7, 1, 1, "", "VideoDeduplicator"]], "data_juicer.ops.deduplicator.DocumentDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.ImageDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.RayBasicDeduplicator": [[7, 4, 1, "", "EMPTY_HASH_VALUE"], [7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"], [7, 2, 1, "", "compute_stats"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.RayDocumentDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.RayImageDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.RayVideoDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.VideoDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.filter": [[8, 1, 1, "", "AlphanumericFilter"], [8, 1, 1, "", "AudioDurationFilter"], [8, 1, 1, "", "AudioNMFSNRFilter"], [8, 1, 1, "", "AudioSizeFilter"], [8, 1, 1, "", "AverageLineLengthFilter"], [8, 1, 1, "", "CharacterRepetitionFilter"], [8, 1, 1, "", "FlaggedWordFilter"], [8, 1, 1, "", "ImageAestheticsFilter"], [8, 1, 1, "", "ImageAspectRatioFilter"], [8, 1, 1, "", "ImageFaceRatioFilter"], [8, 1, 1, "", "ImageNSFWFilter"], [8, 1, 1, "", "ImageShapeFilter"], [8, 1, 1, "", "ImageSizeFilter"], [8, 1, 1, "", "ImageTextMatchingFilter"], [8, 1, 1, "", "ImageTextSimilarityFilter"], [8, 1, 1, "", "ImageWatermarkFilter"], [8, 1, 1, "", "LanguageIDScoreFilter"], [8, 1, 1, "", "MaximumLineLengthFilter"], [8, 1, 1, "", "PerplexityFilter"], [8, 1, 1, "", "PhraseGroundingRecallFilter"], [8, 1, 1, "", "SpecialCharactersFilter"], [8, 1, 1, "", "SpecifiedFieldFilter"], [8, 1, 1, "", "SpecifiedNumericFieldFilter"], [8, 1, 1, "", "StopWordsFilter"], [8, 1, 1, "", "SuffixFilter"], [8, 1, 1, "", "TextActionFilter"], [8, 1, 1, "", "TextEntityDependencyFilter"], [8, 1, 1, "", "TextLengthFilter"], [8, 1, 1, "", "TokenNumFilter"], [8, 1, 1, "", "VideoAestheticsFilter"], [8, 1, 1, "", "VideoAspectRatioFilter"], [8, 1, 1, "", "VideoDurationFilter"], [8, 1, 1, "", "VideoFramesTextSimilarityFilter"], [8, 1, 1, "", "VideoMotionScoreFilter"], [8, 1, 1, "", "VideoNSFWFilter"], [8, 1, 1, "", "VideoOcrAreaRatioFilter"], [8, 1, 1, "", "VideoResolutionFilter"], [8, 1, 1, "", "VideoTaggingFromFramesFilter"], [8, 1, 1, "", "VideoWatermarkFilter"], [8, 1, 1, "", "WordRepetitionFilter"], [8, 1, 1, "", "WordsNumFilter"]], "data_juicer.ops.filter.AlphanumericFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.AudioDurationFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.AudioNMFSNRFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.AudioSizeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.AverageLineLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.CharacterRepetitionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.FlaggedWordFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageAestheticsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageAspectRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageFaceRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageNSFWFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageShapeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageSizeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageTextMatchingFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageTextSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageWatermarkFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.LanguageIDScoreFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.MaximumLineLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.PerplexityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.PhraseGroundingRecallFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.SpecialCharactersFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.SpecifiedFieldFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.SpecifiedNumericFieldFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.StopWordsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.SuffixFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.TextActionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.TextEntityDependencyFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.TextLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.TokenNumFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoAestheticsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoAspectRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoDurationFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoFramesTextSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoMotionScoreFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoNSFWFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoOcrAreaRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "get_reader"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoResolutionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoTaggingFromFramesFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoWatermarkFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.WordRepetitionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.WordsNumFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.mapper": [[9, 1, 1, "", "AudioFFmpegWrappedMapper"], [9, 1, 1, "", "ChineseConvertMapper"], [9, 1, 1, "", "CleanCopyrightMapper"], [9, 1, 1, "", "CleanEmailMapper"], [9, 1, 1, "", "CleanHtmlMapper"], [9, 1, 1, "", "CleanIpMapper"], [9, 1, 1, "", "CleanLinksMapper"], [9, 1, 1, "", "ExpandMacroMapper"], [9, 1, 1, "", "ExtractQAMapper"], [9, 1, 1, "", "FixUnicodeMapper"], [9, 1, 1, "", "ImageBlurMapper"], [9, 1, 1, "", "ImageCaptioningFromGPT4VMapper"], [9, 1, 1, "", "ImageCaptioningMapper"], [9, 1, 1, "", "ImageDiffusionMapper"], [9, 1, 1, "", "ImageFaceBlurMapper"], [9, 1, 1, "", "NlpaugEnMapper"], [9, 1, 1, "", "NlpcdaZhMapper"], [9, 1, 1, "", "PunctuationNormalizationMapper"], [9, 1, 1, "", "RemoveBibliographyMapper"], [9, 1, 1, "", "RemoveCommentsMapper"], [9, 1, 1, "", "RemoveHeaderMapper"], [9, 1, 1, "", "RemoveLongWordsMapper"], [9, 1, 1, "", "RemoveNonChineseCharacterlMapper"], [9, 1, 1, "", "RemoveRepeatSentencesMapper"], [9, 1, 1, "", "RemoveSpecificCharsMapper"], [9, 1, 1, "", "RemoveTableTextMapper"], [9, 1, 1, "", "RemoveWordsWithIncorrectSubstringsMapper"], [9, 1, 1, "", "ReplaceContentMapper"], [9, 1, 1, "", "SentenceSplitMapper"], [9, 1, 1, "", "VideoCaptioningFromAudioMapper"], [9, 1, 1, "", "VideoCaptioningFromFramesMapper"], [9, 1, 1, "", "VideoCaptioningFromSummarizerMapper"], [9, 1, 1, "", "VideoCaptioningFromVideoMapper"], [9, 1, 1, "", "VideoFFmpegWrappedMapper"], [9, 1, 1, "", "VideoFaceBlurMapper"], [9, 1, 1, "", "VideoRemoveWatermarkMapper"], [9, 1, 1, "", "VideoResizeAspectRatioMapper"], [9, 1, 1, "", "VideoResizeResolutionMapper"], [9, 1, 1, "", "VideoSplitByDurationMapper"], [9, 1, 1, "", "VideoSplitByKeyFrameMapper"], [9, 1, 1, "", "VideoSplitBySceneMapper"], [9, 1, 1, "", "VideoTaggingFromAudioMapper"], [9, 1, 1, "", "VideoTaggingFromFramesMapper"], [9, 1, 1, "", "WhitespaceNormalizationMapper"]], "data_juicer.ops.mapper.AudioFFmpegWrappedMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ChineseConvertMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.CleanCopyrightMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.CleanEmailMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.CleanHtmlMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.CleanIpMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.CleanLinksMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ExpandMacroMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ExtractQAMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.FixUnicodeMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ImageBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ImageCaptioningMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ImageDiffusionMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ImageFaceBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.NlpaugEnMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.NlpcdaZhMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.PunctuationNormalizationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveBibliographyMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveCommentsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveHeaderMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveLongWordsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"], [9, 2, 1, "", "should_keep_long_word"]], "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveRepeatSentencesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveSpecificCharsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveTableTextMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"], [9, 2, 1, "", "should_keep_word_with_incorrect_substrings"]], "data_juicer.ops.mapper.ReplaceContentMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.SentenceSplitMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoFFmpegWrappedMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoFaceBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoRemoveWatermarkMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoResizeAspectRatioMapper": [[9, 4, 1, "", "STRATEGY"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoResizeResolutionMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoSplitByDurationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"], [9, 2, 1, "", "split_videos_by_duration"]], "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "get_split_key_frame"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoSplitBySceneMapper": [[9, 2, 1, "", "__init__"], [9, 4, 1, "", "avaliable_detectors"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoTaggingFromAudioMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoTaggingFromFramesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.WhitespaceNormalizationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.selector": [[10, 1, 1, "", "FrequencySpecifiedFieldSelector"], [10, 1, 1, "", "RandomSelector"], [10, 1, 1, "", "RangeSpecifiedFieldSelector"], [10, 1, 1, "", "TopkSpecifiedFieldSelector"]], "data_juicer.ops.selector.FrequencySpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.RandomSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.RangeSpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.TopkSpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:method", "3": "py:function", "4": "py:attribute"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"], "3": ["py", "function", "Python function"], "4": ["py", "attribute", "Python attribute"]}, "titleterms": {"data_juic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14], "analysi": 1, "config": 2, "core": 3, "format": 4, "op": [5, 6, 7, 8, 9, 10], "common": 6, "dedupl": 7, "filter": 8, "mapper": 9, "selector": 10, "tool": 11, "util": 12, "welcom": 13, "data": 13, "juicer": 13, "": 13, "document": 13, "tutori": 13, "api": 13, "refer": 13, "indic": 13, "tabl": 13}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx": 58}, "alltitles": {"data_juicer": [[0, "module-data_juicer"], [14, "data-juicer"]], "data_juicer.analysis": [[1, "module-data_juicer.analysis"]], "data_juicer.config": [[2, "module-data_juicer.config"]], "data_juicer.core": [[3, "module-data_juicer.core"]], "data_juicer.format": [[4, "module-data_juicer.format"]], "data_juicer.ops": [[5, "module-data_juicer.ops"]], "data_juicer.ops.common": [[6, "module-data_juicer.ops.common"]], "data_juicer.ops.deduplicator": [[7, "module-data_juicer.ops.deduplicator"]], "data_juicer.ops.filter": [[8, "module-data_juicer.ops.filter"]], "data_juicer.ops.mapper": [[9, "module-data_juicer.ops.mapper"]], "data_juicer.ops.selector": [[10, "module-data_juicer.ops.selector"]], "data_juicer.tools": [[11, "module-data_juicer.tools"]], "data_juicer.utils": [[12, "module-data_juicer.utils"]], "Welcome to data-juicer\u2019s documentation!": [[13, "welcome-to-data-juicer-s-documentation"]], "Tutorial": [[13, "tutorial"]], "API Reference": [[13, null]], "Indices and Tables": [[13, "indices-and-tables"]]}, "indexentries": {"cuda_device_count() (in module data_juicer)": [[0, "data_juicer.cuda_device_count"]], "data_juicer": [[0, "module-data_juicer"]], "is_cuda_available() (in module data_juicer)": [[0, "data_juicer.is_cuda_available"]], "module": [[0, "module-data_juicer"], [1, "module-data_juicer.analysis"], [2, "module-data_juicer.config"], [3, "module-data_juicer.core"], [4, "module-data_juicer.format"], [5, "module-data_juicer.ops"], [6, "module-data_juicer.ops.common"], [7, "module-data_juicer.ops.deduplicator"], [8, "module-data_juicer.ops.filter"], [9, "module-data_juicer.ops.mapper"], [10, "module-data_juicer.ops.selector"], [11, "module-data_juicer.tools"], [12, "module-data_juicer.utils"]], "columnwiseanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.ColumnWiseAnalysis"]], "diversityanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.DiversityAnalysis"]], "overallanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.OverallAnalysis"]], "__init__() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.__init__"]], "__init__() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.__init__"]], "__init__() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.__init__"]], "analyze() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.analyze"]], "analyze() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.analyze"]], "analyze() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.analyze"]], "compute() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.compute"]], "data_juicer.analysis": [[1, "module-data_juicer.analysis"]], "draw_box() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.draw_box"]], "draw_hist() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.draw_hist"]], "refine_single_column() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.refine_single_column"]], "data_juicer.config": [[2, "module-data_juicer.config"]], "export_config() (in module data_juicer.config)": [[2, "data_juicer.config.export_config"]], "init_configs() (in module data_juicer.config)": [[2, "data_juicer.config.init_configs"]], "merge_config() (in module data_juicer.config)": [[2, "data_juicer.config.merge_config"]], "analyzer (class in data_juicer.core)": [[3, "data_juicer.core.Analyzer"]], "executor (class in data_juicer.core)": [[3, "data_juicer.core.Executor"]], "exporter (class in data_juicer.core)": [[3, "data_juicer.core.Exporter"]], "gib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.GiB"]], "kib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.KiB"]], "mib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.MiB"]], "nesteddataset (class in data_juicer.core)": [[3, "data_juicer.core.NestedDataset"]], "tib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.TiB"]], "tracer (class in data_juicer.core)": [[3, "data_juicer.core.Tracer"]], "__init__() (data_juicer.core.analyzer method)": [[3, "data_juicer.core.Analyzer.__init__"]], "__init__() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.__init__"]], "__init__() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.__init__"]], "__init__() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.__init__"]], "__init__() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.__init__"]], "add_column() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.add_column"]], "cleanup_cache_files() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.cleanup_cache_files"]], "data_juicer.core": [[3, "module-data_juicer.core"]], "export() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.export"]], "export_compute_stats() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.export_compute_stats"]], "filter() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.filter"]], "from_dict() (data_juicer.core.nesteddataset class method)": [[3, "data_juicer.core.NestedDataset.from_dict"]], "map() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.map"]], "process() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.process"]], "remove_columns() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.remove_columns"]], "run() (data_juicer.core.analyzer method)": [[3, "data_juicer.core.Analyzer.run"]], "run() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.run"]], "sample_data() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.sample_data"]], "select() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.select"]], "select_columns() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.select_columns"]], "to_json() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_json"]], "to_jsonl() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_jsonl"]], "to_parquet() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_parquet"]], "trace_batch_mapper() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_batch_mapper"]], "trace_deduplicator() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_deduplicator"]], "trace_filter() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_filter"]], "trace_mapper() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_mapper"]], "csvformatter (class in data_juicer.format)": [[4, "data_juicer.format.CsvFormatter"]], "jsonformatter (class in data_juicer.format)": [[4, "data_juicer.format.JsonFormatter"]], "localformatter (class in data_juicer.format)": [[4, "data_juicer.format.LocalFormatter"]], "mixtureformatter (class in data_juicer.format)": [[4, "data_juicer.format.MixtureFormatter"]], "parquetformatter (class in data_juicer.format)": [[4, "data_juicer.format.ParquetFormatter"]], "remoteformatter (class in data_juicer.format)": [[4, "data_juicer.format.RemoteFormatter"]], "suffixes (data_juicer.format.csvformatter attribute)": [[4, "data_juicer.format.CsvFormatter.SUFFIXES"]], "suffixes (data_juicer.format.jsonformatter attribute)": [[4, "data_juicer.format.JsonFormatter.SUFFIXES"]], "suffixes (data_juicer.format.parquetformatter attribute)": [[4, "data_juicer.format.ParquetFormatter.SUFFIXES"]], "suffixes (data_juicer.format.textformatter attribute)": [[4, "data_juicer.format.TextFormatter.SUFFIXES"]], "suffixes (data_juicer.format.tsvformatter attribute)": [[4, "data_juicer.format.TsvFormatter.SUFFIXES"]], "textformatter (class in data_juicer.format)": [[4, "data_juicer.format.TextFormatter"]], "tsvformatter (class in data_juicer.format)": [[4, "data_juicer.format.TsvFormatter"]], "__init__() (data_juicer.format.csvformatter method)": [[4, "data_juicer.format.CsvFormatter.__init__"]], "__init__() (data_juicer.format.jsonformatter method)": [[4, "data_juicer.format.JsonFormatter.__init__"]], "__init__() (data_juicer.format.localformatter method)": [[4, "data_juicer.format.LocalFormatter.__init__"]], "__init__() (data_juicer.format.mixtureformatter method)": [[4, "data_juicer.format.MixtureFormatter.__init__"]], "__init__() (data_juicer.format.parquetformatter method)": [[4, "data_juicer.format.ParquetFormatter.__init__"]], "__init__() (data_juicer.format.remoteformatter method)": [[4, "data_juicer.format.RemoteFormatter.__init__"]], "__init__() (data_juicer.format.textformatter method)": [[4, "data_juicer.format.TextFormatter.__init__"]], "__init__() (data_juicer.format.tsvformatter method)": [[4, "data_juicer.format.TsvFormatter.__init__"]], "data_juicer.format": [[4, "module-data_juicer.format"]], "load_dataset() (data_juicer.format.localformatter method)": [[4, "data_juicer.format.LocalFormatter.load_dataset"]], "load_dataset() (data_juicer.format.mixtureformatter method)": [[4, "data_juicer.format.MixtureFormatter.load_dataset"]], "load_dataset() (data_juicer.format.remoteformatter method)": [[4, "data_juicer.format.RemoteFormatter.load_dataset"]], "load_dataset() (data_juicer.format.textformatter method)": [[4, "data_juicer.format.TextFormatter.load_dataset"]], "load_formatter() (in module data_juicer.format)": [[4, "data_juicer.format.load_formatter"]], "random_sample() (data_juicer.format.mixtureformatter class method)": [[4, "data_juicer.format.MixtureFormatter.random_sample"]], "deduplicator (class in data_juicer.ops)": [[5, "data_juicer.ops.Deduplicator"]], "filter (class in data_juicer.ops)": [[5, "data_juicer.ops.Filter"]], "mapper (class in data_juicer.ops)": [[5, "data_juicer.ops.Mapper"]], "selector (class in data_juicer.ops)": [[5, "data_juicer.ops.Selector"]], "__init__() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.__init__"]], "__init__() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.__init__"]], "__init__() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.__init__"]], "__init__() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.__init__"]], "compute_hash() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.compute_hash"]], "compute_stats() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.compute_stats"]], "data_juicer.ops": [[5, "module-data_juicer.ops"]], "load_ops() (in module data_juicer.ops)": [[5, "data_juicer.ops.load_ops"]], "process() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.process"]], "process() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.process"]], "process() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.process"]], "process() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.process"]], "run() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.run"]], "run() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.run"]], "run() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.run"]], "run() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.run"]], "data_juicer.ops.common": [[6, "module-data_juicer.ops.common"]], "get_sentences_from_document() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.get_sentences_from_document"]], "get_words_from_document() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.get_words_from_document"]], "merge_on_whitespace_tab_newline() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.merge_on_whitespace_tab_newline"]], "split_on_newline_tab_whitespace() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.split_on_newline_tab_whitespace"]], "split_on_whitespace() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.split_on_whitespace"]], "strip() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.strip"]], "words_augmentation() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.words_augmentation"]], "words_refinement() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.words_refinement"]], "documentdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator"]], "documentminhashdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator"]], "documentsimhashdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator"]], "empty_hash_value (data_juicer.ops.deduplicator.raybasicdeduplicator attribute)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.EMPTY_HASH_VALUE"]], "imagededuplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator"]], "raybasicdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator"]], "raydocumentdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator"]], "rayimagededuplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator"]], "rayvideodeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator"]], "videodeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator"]], "__init__() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.raydocumentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.rayimagededuplicator method)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.rayvideodeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.__init__"]], "calculate_hash() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.raydocumentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.rayimagededuplicator method)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.rayvideodeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator.calculate_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.compute_hash"]], "compute_stats() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.compute_stats"]], "data_juicer.ops.deduplicator": [[7, "module-data_juicer.ops.deduplicator"]], "process() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.process"]], "alphanumericfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AlphanumericFilter"]], "audiodurationfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioDurationFilter"]], "audionmfsnrfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter"]], "audiosizefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioSizeFilter"]], "averagelinelengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter"]], "characterrepetitionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter"]], "flaggedwordfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.FlaggedWordFilter"]], "imageaestheticsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter"]], "imageaspectratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter"]], "imagefaceratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter"]], "imagensfwfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageNSFWFilter"]], "imageshapefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageShapeFilter"]], "imagesizefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageSizeFilter"]], "imagetextmatchingfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter"]], "imagetextsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter"]], "imagewatermarkfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter"]], "languageidscorefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter"]], "maximumlinelengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter"]], "perplexityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.PerplexityFilter"]], "phrasegroundingrecallfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter"]], "specialcharactersfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter"]], "specifiedfieldfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter"]], "specifiednumericfieldfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter"]], "stopwordsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.StopWordsFilter"]], "suffixfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SuffixFilter"]], "textactionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextActionFilter"]], "textentitydependencyfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter"]], "textlengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextLengthFilter"]], "tokennumfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TokenNumFilter"]], "videoaestheticsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter"]], "videoaspectratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter"]], "videodurationfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoDurationFilter"]], "videoframestextsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter"]], "videomotionscorefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter"]], "videonsfwfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoNSFWFilter"]], "videoocrarearatiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter"]], "videoresolutionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoResolutionFilter"]], "videotaggingfromframesfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter"]], "videowatermarkfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter"]], "wordrepetitionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.WordRepetitionFilter"]], "wordsnumfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.WordsNumFilter"]], "__init__() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.__init__"]], "__init__() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.__init__"]], "__init__() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.__init__"]], "__init__() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.__init__"]], "__init__() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.__init__"]], "__init__() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.__init__"]], "__init__() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.__init__"]], "__init__() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.__init__"]], "__init__() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.__init__"]], "__init__() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.__init__"]], "__init__() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.__init__"]], "__init__() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.__init__"]], "__init__() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.__init__"]], "__init__() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.__init__"]], "__init__() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.__init__"]], "__init__() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.__init__"]], "__init__() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.__init__"]], "__init__() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.__init__"]], "__init__() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.__init__"]], "__init__() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.__init__"]], "__init__() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.__init__"]], "__init__() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.__init__"]], "compute_stats() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.compute_stats"]], "data_juicer.ops.filter": [[8, "module-data_juicer.ops.filter"]], "get_reader() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.get_reader"]], "process() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.process"]], "process() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.process"]], "process() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.process"]], "process() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.process"]], "process() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.process"]], "process() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.process"]], "process() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.process"]], "process() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.process"]], "process() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.process"]], "process() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.process"]], "process() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.process"]], "process() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.process"]], "process() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.process"]], "process() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.process"]], "process() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.process"]], "process() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.process"]], "process() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.process"]], "process() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.process"]], "process() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.process"]], "process() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.process"]], "process() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.process"]], "process() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.process"]], "process() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.process"]], "process() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.process"]], "process() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.process"]], "process() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.process"]], "process() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.process"]], "process() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.process"]], "process() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.process"]], "process() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.process"]], "process() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.process"]], "process() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.process"]], "process() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.process"]], "process() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.process"]], "process() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.process"]], "process() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.process"]], "process() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.process"]], "process() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.process"]], "process() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.process"]], "process() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.process"]], "process() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.process"]], "audioffmpegwrappedmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper"]], "chineseconvertmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper"]], "cleancopyrightmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper"]], "cleanemailmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanEmailMapper"]], "cleanhtmlmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper"]], "cleanipmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanIpMapper"]], "cleanlinksmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanLinksMapper"]], "expandmacromapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper"]], "extractqamapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExtractQAMapper"]], "fixunicodemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper"]], "imageblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageBlurMapper"]], "imagecaptioningfromgpt4vmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper"]], "imagecaptioningmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper"]], "imagediffusionmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper"]], "imagefaceblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper"]], "nlpaugenmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper"]], "nlpcdazhmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper"]], "punctuationnormalizationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper"]], "removebibliographymapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper"]], "removecommentsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper"]], "removeheadermapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper"]], "removelongwordsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper"]], "removenonchinesecharacterlmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper"]], "removerepeatsentencesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper"]], "removespecificcharsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper"]], "removetabletextmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper"]], "removewordswithincorrectsubstringsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper"]], "replacecontentmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper"]], "strategy (data_juicer.ops.mapper.videoresizeaspectratiomapper attribute)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.STRATEGY"]], "sentencesplitmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper"]], "videocaptioningfromaudiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper"]], "videocaptioningfromframesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper"]], "videocaptioningfromsummarizermapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper"]], "videocaptioningfromvideomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper"]], "videoffmpegwrappedmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper"]], "videofaceblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper"]], "videoremovewatermarkmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper"]], "videoresizeaspectratiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper"]], "videoresizeresolutionmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper"]], "videosplitbydurationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper"]], "videosplitbykeyframemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper"]], "videosplitbyscenemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper"]], "videotaggingfromaudiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper"]], "videotaggingfromframesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper"]], "whitespacenormalizationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper"]], "__init__() (data_juicer.ops.mapper.audioffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper.__init__"]], "__init__() (data_juicer.ops.mapper.chineseconvertmapper method)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleancopyrightmapper method)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanemailmapper method)": [[9, "data_juicer.ops.mapper.CleanEmailMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanhtmlmapper method)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanipmapper method)": [[9, "data_juicer.ops.mapper.CleanIpMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanlinksmapper method)": [[9, "data_juicer.ops.mapper.CleanLinksMapper.__init__"]], "__init__() (data_juicer.ops.mapper.expandmacromapper method)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper.__init__"]], "__init__() (data_juicer.ops.mapper.extractqamapper method)": [[9, "data_juicer.ops.mapper.ExtractQAMapper.__init__"]], "__init__() (data_juicer.ops.mapper.fixunicodemapper method)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imageblurmapper method)": [[9, "data_juicer.ops.mapper.ImageBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagecaptioningfromgpt4vmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagecaptioningmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagediffusionmapper method)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagefaceblurmapper method)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.nlpaugenmapper method)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper.__init__"]], "__init__() (data_juicer.ops.mapper.nlpcdazhmapper method)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper.__init__"]], "__init__() (data_juicer.ops.mapper.punctuationnormalizationmapper method)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removebibliographymapper method)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removecommentsmapper method)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removeheadermapper method)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removenonchinesecharacterlmapper method)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removerepeatsentencesmapper method)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removespecificcharsmapper method)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removetabletextmapper method)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.replacecontentmapper method)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper.__init__"]], "__init__() (data_juicer.ops.mapper.sentencesplitmapper method)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromsummarizermapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromvideomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videofaceblurmapper method)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoremovewatermarkmapper method)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoresizeaspectratiomapper method)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoresizeresolutionmapper method)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbyscenemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videotaggingfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videotaggingfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.whitespacenormalizationmapper method)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper.__init__"]], "avaliable_detectors (data_juicer.ops.mapper.videosplitbyscenemapper attribute)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.avaliable_detectors"]], "data_juicer.ops.mapper": [[9, "module-data_juicer.ops.mapper"]], "get_split_key_frame() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.get_split_key_frame"]], "process() (data_juicer.ops.mapper.audioffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper.process"]], "process() (data_juicer.ops.mapper.chineseconvertmapper method)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper.process"]], "process() (data_juicer.ops.mapper.cleancopyrightmapper method)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper.process"]], "process() (data_juicer.ops.mapper.cleanemailmapper method)": [[9, "data_juicer.ops.mapper.CleanEmailMapper.process"]], "process() (data_juicer.ops.mapper.cleanhtmlmapper method)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper.process"]], "process() (data_juicer.ops.mapper.cleanipmapper method)": [[9, "data_juicer.ops.mapper.CleanIpMapper.process"]], "process() (data_juicer.ops.mapper.cleanlinksmapper method)": [[9, "data_juicer.ops.mapper.CleanLinksMapper.process"]], "process() (data_juicer.ops.mapper.expandmacromapper method)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper.process"]], "process() (data_juicer.ops.mapper.extractqamapper method)": [[9, "data_juicer.ops.mapper.ExtractQAMapper.process"]], "process() (data_juicer.ops.mapper.fixunicodemapper method)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper.process"]], "process() (data_juicer.ops.mapper.imageblurmapper method)": [[9, "data_juicer.ops.mapper.ImageBlurMapper.process"]], "process() (data_juicer.ops.mapper.imagecaptioningfromgpt4vmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper.process"]], "process() (data_juicer.ops.mapper.imagecaptioningmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper.process"]], "process() (data_juicer.ops.mapper.imagediffusionmapper method)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper.process"]], "process() (data_juicer.ops.mapper.imagefaceblurmapper method)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper.process"]], "process() (data_juicer.ops.mapper.nlpaugenmapper method)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper.process"]], "process() (data_juicer.ops.mapper.nlpcdazhmapper method)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper.process"]], "process() (data_juicer.ops.mapper.punctuationnormalizationmapper method)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper.process"]], "process() (data_juicer.ops.mapper.removebibliographymapper method)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper.process"]], "process() (data_juicer.ops.mapper.removecommentsmapper method)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper.process"]], "process() (data_juicer.ops.mapper.removeheadermapper method)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper.process"]], "process() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.process"]], "process() (data_juicer.ops.mapper.removenonchinesecharacterlmapper method)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper.process"]], "process() (data_juicer.ops.mapper.removerepeatsentencesmapper method)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper.process"]], "process() (data_juicer.ops.mapper.removespecificcharsmapper method)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper.process"]], "process() (data_juicer.ops.mapper.removetabletextmapper method)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper.process"]], "process() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.process"]], "process() (data_juicer.ops.mapper.replacecontentmapper method)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper.process"]], "process() (data_juicer.ops.mapper.sentencesplitmapper method)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper.process"]], "process() (data_juicer.ops.mapper.videocaptioningfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper.process"]], "process() (data_juicer.ops.mapper.videocaptioningfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper.process"]], "process() (data_juicer.ops.mapper.videocaptioningfromsummarizermapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper.process"]], "process() (data_juicer.ops.mapper.videocaptioningfromvideomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper.process"]], "process() (data_juicer.ops.mapper.videoffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper.process"]], "process() (data_juicer.ops.mapper.videofaceblurmapper method)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper.process"]], "process() (data_juicer.ops.mapper.videoremovewatermarkmapper method)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper.process"]], "process() (data_juicer.ops.mapper.videoresizeaspectratiomapper method)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.process"]], "process() (data_juicer.ops.mapper.videoresizeresolutionmapper method)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper.process"]], "process() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.process"]], "process() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.process"]], "process() (data_juicer.ops.mapper.videosplitbyscenemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.process"]], "process() (data_juicer.ops.mapper.videotaggingfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper.process"]], "process() (data_juicer.ops.mapper.videotaggingfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper.process"]], "process() (data_juicer.ops.mapper.whitespacenormalizationmapper method)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper.process"]], "should_keep_long_word() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.should_keep_long_word"]], "should_keep_word_with_incorrect_substrings() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.should_keep_word_with_incorrect_substrings"]], "split_videos_by_duration() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.split_videos_by_duration"]], "frequencyspecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector"]], "randomselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.RandomSelector"]], "rangespecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector"]], "topkspecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector"]], "__init__() (data_juicer.ops.selector.frequencyspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector.__init__"]], "__init__() (data_juicer.ops.selector.randomselector method)": [[10, "data_juicer.ops.selector.RandomSelector.__init__"]], "__init__() (data_juicer.ops.selector.rangespecifiedfieldselector method)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector.__init__"]], "__init__() (data_juicer.ops.selector.topkspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector.__init__"]], "data_juicer.ops.selector": [[10, "module-data_juicer.ops.selector"]], "process() (data_juicer.ops.selector.frequencyspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector.process"]], "process() (data_juicer.ops.selector.randomselector method)": [[10, "data_juicer.ops.selector.RandomSelector.process"]], "process() (data_juicer.ops.selector.rangespecifiedfieldselector method)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector.process"]], "process() (data_juicer.ops.selector.topkspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector.process"]], "data_juicer.tools": [[11, "module-data_juicer.tools"]], "data_juicer.utils": [[12, "module-data_juicer.utils"]]}})
\ No newline at end of file