Skip to content

Commit

Permalink
deploy: 8d53f23
Browse files Browse the repository at this point in the history
  • Loading branch information
yxdyc committed Aug 1, 2024
1 parent 8ec8287 commit 678d665
Show file tree
Hide file tree
Showing 20 changed files with 66 additions and 38 deletions.
2 changes: 2 additions & 0 deletions _modules/data_juicer/ops/base_op.html
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ <h1>Source code for data_juicer.ops.base_op</h1><div class="highlight"><pre>
<span class="n">logger</span><span class="o">.</span><span class="n">error</span><span class="p">(</span>
<span class="sa">f</span><span class="s1">&#39;An error occurred in mapper operation when processing &#39;</span>
<span class="sa">f</span><span class="s1">&#39;samples </span><span class="si">{</span><span class="n">samples</span><span class="si">}</span><span class="s1">, </span><span class="si">{</span><span class="nb">type</span><span class="p">(</span><span class="n">e</span><span class="p">)</span><span class="si">}</span><span class="s1">: </span><span class="si">{</span><span class="n">e</span><span class="si">}</span><span class="s1">&#39;</span><span class="p">)</span>
<span class="n">traceback</span><span class="o">.</span><span class="n">print_exc</span><span class="p">()</span>
<span class="n">ret</span> <span class="o">=</span> <span class="p">{</span><span class="n">key</span><span class="p">:</span> <span class="p">[]</span> <span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">samples</span><span class="o">.</span><span class="n">keys</span><span class="p">()}</span>
<span class="n">ret</span><span class="p">[</span><span class="n">Fields</span><span class="o">.</span><span class="n">stats</span><span class="p">]</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">ret</span><span class="p">[</span><span class="n">Fields</span><span class="o">.</span><span class="n">source_file</span><span class="p">]</span> <span class="o">=</span> <span class="p">[]</span>
Expand Down Expand Up @@ -181,6 +182,7 @@ <h1>Source code for data_juicer.ops.base_op</h1><div class="highlight"><pre>
<span class="n">logger</span><span class="o">.</span><span class="n">error</span><span class="p">(</span>
<span class="sa">f</span><span class="s1">&#39;An error occurred in mapper operation when processing &#39;</span>
<span class="sa">f</span><span class="s1">&#39;sample </span><span class="si">{</span><span class="n">sample</span><span class="si">}</span><span class="s1">, </span><span class="si">{</span><span class="nb">type</span><span class="p">(</span><span class="n">e</span><span class="p">)</span><span class="si">}</span><span class="s1">: </span><span class="si">{</span><span class="n">e</span><span class="si">}</span><span class="s1">&#39;</span><span class="p">)</span>
<span class="n">traceback</span><span class="o">.</span><span class="n">print_exc</span><span class="p">()</span>
<span class="n">ret</span> <span class="o">=</span> <span class="p">{</span><span class="n">key</span><span class="p">:</span> <span class="p">[]</span> <span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">sample</span><span class="o">.</span><span class="n">keys</span><span class="p">()}</span>
<span class="n">ret</span><span class="p">[</span><span class="n">Fields</span><span class="o">.</span><span class="n">stats</span><span class="p">]</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">ret</span><span class="p">[</span><span class="n">Fields</span><span class="o">.</span><span class="n">source_file</span><span class="p">]</span> <span class="o">=</span> <span class="p">[]</span>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,8 @@ <h1>Source code for data_juicer.ops.deduplicator.ray_video_deduplicator</h1><div

<span class="kn">from</span> <span class="nn">jsonargparse.typing</span> <span class="kn">import</span> <span class="n">PositiveInt</span>

<span class="kn">from</span> <span class="nn">data_juicer.utils.mm_utils</span> <span class="kn">import</span> <span class="n">load_data_with_context</span><span class="p">,</span> <span class="n">load_video</span>
<span class="kn">from</span> <span class="nn">data_juicer.utils.mm_utils</span> <span class="kn">import</span> <span class="p">(</span><span class="n">close_video</span><span class="p">,</span> <span class="n">load_data_with_context</span><span class="p">,</span>
<span class="n">load_video</span><span class="p">)</span>

<span class="kn">from</span> <span class="nn">..base_op</span> <span class="kn">import</span> <span class="n">OPERATORS</span>
<span class="kn">from</span> <span class="nn">..op_fusion</span> <span class="kn">import</span> <span class="n">LOADED_VIDEOS</span>
Expand Down Expand Up @@ -136,6 +137,9 @@ <h1>Source code for data_juicer.ops.deduplicator.ray_video_deduplicator</h1><div
<span class="k">if</span> <span class="n">packet</span><span class="o">.</span><span class="n">stream</span><span class="o">.</span><span class="n">type</span> <span class="o">==</span> <span class="s1">&#39;video&#39;</span><span class="p">:</span>
<span class="n">md5_hash</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="nb">bytes</span><span class="p">(</span><span class="n">packet</span><span class="p">))</span>

<span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">videos</span><span class="p">:</span>
<span class="n">close_video</span><span class="p">(</span><span class="n">videos</span><span class="p">[</span><span class="n">key</span><span class="p">])</span>

<span class="k">return</span> <span class="n">md5_hash</span><span class="o">.</span><span class="n">hexdigest</span><span class="p">()</span></div></div>
</pre></div>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,8 @@ <h1>Source code for data_juicer.ops.deduplicator.video_deduplicator</h1><div cla
<span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Dict</span><span class="p">,</span> <span class="n">Set</span><span class="p">,</span> <span class="n">Tuple</span>

<span class="kn">from</span> <span class="nn">data_juicer.utils.constant</span> <span class="kn">import</span> <span class="n">HashKeys</span>
<span class="kn">from</span> <span class="nn">data_juicer.utils.mm_utils</span> <span class="kn">import</span> <span class="n">load_data_with_context</span><span class="p">,</span> <span class="n">load_video</span>
<span class="kn">from</span> <span class="nn">data_juicer.utils.mm_utils</span> <span class="kn">import</span> <span class="p">(</span><span class="n">close_video</span><span class="p">,</span> <span class="n">load_data_with_context</span><span class="p">,</span>
<span class="n">load_video</span><span class="p">)</span>

<span class="kn">from</span> <span class="nn">..base_op</span> <span class="kn">import</span> <span class="n">OPERATORS</span><span class="p">,</span> <span class="n">Deduplicator</span>
<span class="kn">from</span> <span class="nn">..op_fusion</span> <span class="kn">import</span> <span class="n">LOADED_VIDEOS</span>
Expand Down Expand Up @@ -145,6 +146,9 @@ <h1>Source code for data_juicer.ops.deduplicator.video_deduplicator</h1><div cla
<span class="k">if</span> <span class="n">packet</span><span class="o">.</span><span class="n">stream</span><span class="o">.</span><span class="n">type</span> <span class="o">==</span> <span class="s1">&#39;video&#39;</span><span class="p">:</span>
<span class="n">md5_hash</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="nb">bytes</span><span class="p">(</span><span class="n">packet</span><span class="p">))</span>

<span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">videos</span><span class="p">:</span>
<span class="n">close_video</span><span class="p">(</span><span class="n">videos</span><span class="p">[</span><span class="n">key</span><span class="p">])</span>

<span class="n">sample</span><span class="p">[</span><span class="n">HashKeys</span><span class="o">.</span><span class="n">videohash</span><span class="p">]</span> <span class="o">=</span> <span class="n">md5_hash</span><span class="o">.</span><span class="n">hexdigest</span><span class="p">()</span>
<span class="k">return</span> <span class="n">sample</span></div>

Expand Down
4 changes: 2 additions & 2 deletions _modules/data_juicer/ops/filter/video_aesthetics_filter.html
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ <h1>Source code for data_juicer.ops.filter.video_aesthetics_filter</h1><div clas

<span class="kn">from</span> <span class="nn">data_juicer.utils.availability_utils</span> <span class="kn">import</span> <span class="n">AvailabilityChecking</span>
<span class="kn">from</span> <span class="nn">data_juicer.utils.constant</span> <span class="kn">import</span> <span class="n">Fields</span><span class="p">,</span> <span class="n">StatsKeys</span>
<span class="kn">from</span> <span class="nn">data_juicer.utils.mm_utils</span> <span class="kn">import</span> <span class="p">(</span><span class="n">extract_key_frames</span><span class="p">,</span>
<span class="kn">from</span> <span class="nn">data_juicer.utils.mm_utils</span> <span class="kn">import</span> <span class="p">(</span><span class="n">close_video</span><span class="p">,</span> <span class="n">extract_key_frames</span><span class="p">,</span>
<span class="n">extract_video_frames_uniformly</span><span class="p">,</span>
<span class="n">load_data_with_context</span><span class="p">,</span> <span class="n">load_video</span><span class="p">)</span>

Expand Down Expand Up @@ -265,7 +265,7 @@ <h1>Source code for data_juicer.ops.filter.video_aesthetics_filter</h1><div clas

<span class="k">if</span> <span class="ow">not</span> <span class="n">context</span><span class="p">:</span>
<span class="k">for</span> <span class="n">vid_key</span> <span class="ow">in</span> <span class="n">videos</span><span class="p">:</span>
<span class="n">videos</span><span class="p">[</span><span class="n">vid_key</span><span class="p">]</span><span class="o">.</span><span class="n">close</span><span class="p">()</span>
<span class="n">close_video</span><span class="p">(</span><span class="n">videos</span><span class="p">[</span><span class="n">vid_key</span><span class="p">])</span>

<span class="k">return</span> <span class="n">sample</span></div>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,8 @@ <h1>Source code for data_juicer.ops.filter.video_aspect_ratio_filter</h1><div cl
<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>

<span class="kn">from</span> <span class="nn">data_juicer.utils.constant</span> <span class="kn">import</span> <span class="n">Fields</span><span class="p">,</span> <span class="n">StatsKeys</span>
<span class="kn">from</span> <span class="nn">data_juicer.utils.mm_utils</span> <span class="kn">import</span> <span class="n">load_data_with_context</span><span class="p">,</span> <span class="n">load_video</span>
<span class="kn">from</span> <span class="nn">data_juicer.utils.mm_utils</span> <span class="kn">import</span> <span class="p">(</span><span class="n">close_video</span><span class="p">,</span> <span class="n">load_data_with_context</span><span class="p">,</span>
<span class="n">load_video</span><span class="p">)</span>

<span class="kn">from</span> <span class="nn">..base_op</span> <span class="kn">import</span> <span class="n">OPERATORS</span><span class="p">,</span> <span class="n">Filter</span>
<span class="kn">from</span> <span class="nn">..op_fusion</span> <span class="kn">import</span> <span class="n">LOADED_VIDEOS</span>
Expand Down Expand Up @@ -151,7 +152,7 @@ <h1>Source code for data_juicer.ops.filter.video_aspect_ratio_filter</h1><div cl
<span class="n">video_aspect_ratios</span><span class="p">[</span>
<span class="n">key</span><span class="p">]</span> <span class="o">=</span> <span class="n">stream</span><span class="o">.</span><span class="n">codec_context</span><span class="o">.</span><span class="n">width</span> <span class="o">/</span> <span class="n">stream</span><span class="o">.</span><span class="n">codec_context</span><span class="o">.</span><span class="n">height</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">context</span><span class="p">:</span>
<span class="n">video</span><span class="o">.</span><span class="n">close</span><span class="p">()</span>
<span class="n">close_video</span><span class="p">(</span><span class="n">video</span><span class="p">)</span>

<span class="n">sample</span><span class="p">[</span><span class="n">Fields</span><span class="o">.</span><span class="n">stats</span><span class="p">][</span><span class="n">StatsKeys</span><span class="o">.</span><span class="n">video_aspect_ratios</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">video_aspect_ratios</span><span class="p">[</span><span class="n">key</span><span class="p">]</span> <span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">loaded_video_keys</span>
Expand Down
5 changes: 3 additions & 2 deletions _modules/data_juicer/ops/filter/video_duration_filter.html
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,8 @@ <h1>Source code for data_juicer.ops.filter.video_duration_filter</h1><div class=
<span class="kn">from</span> <span class="nn">jsonargparse.typing</span> <span class="kn">import</span> <span class="n">NonNegativeInt</span>

<span class="kn">from</span> <span class="nn">data_juicer.utils.constant</span> <span class="kn">import</span> <span class="n">Fields</span><span class="p">,</span> <span class="n">StatsKeys</span>
<span class="kn">from</span> <span class="nn">data_juicer.utils.mm_utils</span> <span class="kn">import</span> <span class="n">load_data_with_context</span><span class="p">,</span> <span class="n">load_video</span>
<span class="kn">from</span> <span class="nn">data_juicer.utils.mm_utils</span> <span class="kn">import</span> <span class="p">(</span><span class="n">close_video</span><span class="p">,</span> <span class="n">load_data_with_context</span><span class="p">,</span>
<span class="n">load_video</span><span class="p">)</span>

<span class="kn">from</span> <span class="nn">..base_op</span> <span class="kn">import</span> <span class="n">OPERATORS</span><span class="p">,</span> <span class="n">Filter</span>
<span class="kn">from</span> <span class="nn">..op_fusion</span> <span class="kn">import</span> <span class="n">LOADED_VIDEOS</span>
Expand Down Expand Up @@ -152,7 +153,7 @@ <h1>Source code for data_juicer.ops.filter.video_duration_filter</h1><div class=
<span class="n">video_durations</span><span class="p">[</span><span class="n">video_key</span><span class="p">]</span> <span class="o">=</span> <span class="nb">round</span><span class="p">(</span><span class="n">stream</span><span class="o">.</span><span class="n">duration</span> <span class="o">*</span>
<span class="n">stream</span><span class="o">.</span><span class="n">time_base</span><span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">context</span><span class="p">:</span>
<span class="n">video</span><span class="o">.</span><span class="n">close</span><span class="p">()</span>
<span class="n">close_video</span><span class="p">(</span><span class="n">video</span><span class="p">)</span>

<span class="c1"># get video durations</span>
<span class="n">sample</span><span class="p">[</span><span class="n">Fields</span><span class="o">.</span><span class="n">stats</span><span class="p">][</span><span class="n">StatsKeys</span><span class="o">.</span><span class="n">video_duration</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,8 @@ <h1>Source code for data_juicer.ops.filter.video_frames_text_similarity_filter</

<span class="kn">from</span> <span class="nn">data_juicer.utils.availability_utils</span> <span class="kn">import</span> <span class="n">AvailabilityChecking</span>
<span class="kn">from</span> <span class="nn">data_juicer.utils.constant</span> <span class="kn">import</span> <span class="n">Fields</span><span class="p">,</span> <span class="n">StatsKeys</span>
<span class="kn">from</span> <span class="nn">data_juicer.utils.mm_utils</span> <span class="kn">import</span> <span class="p">(</span><span class="n">SpecialTokens</span><span class="p">,</span> <span class="n">extract_key_frames</span><span class="p">,</span>
<span class="kn">from</span> <span class="nn">data_juicer.utils.mm_utils</span> <span class="kn">import</span> <span class="p">(</span><span class="n">SpecialTokens</span><span class="p">,</span> <span class="n">close_video</span><span class="p">,</span>
<span class="n">extract_key_frames</span><span class="p">,</span>
<span class="n">extract_video_frames_uniformly</span><span class="p">,</span>
<span class="n">load_data_with_context</span><span class="p">,</span> <span class="n">load_video</span><span class="p">,</span>
<span class="n">remove_special_tokens</span><span class="p">)</span>
Expand Down Expand Up @@ -279,7 +280,7 @@ <h1>Source code for data_juicer.ops.filter.video_frames_text_similarity_filter</

<span class="k">if</span> <span class="ow">not</span> <span class="n">context</span><span class="p">:</span>
<span class="k">for</span> <span class="n">vid_key</span> <span class="ow">in</span> <span class="n">videos</span><span class="p">:</span>
<span class="n">videos</span><span class="p">[</span><span class="n">vid_key</span><span class="p">]</span><span class="o">.</span><span class="n">close</span><span class="p">()</span>
<span class="n">close_video</span><span class="p">(</span><span class="n">videos</span><span class="p">[</span><span class="n">vid_key</span><span class="p">])</span>

<span class="k">return</span> <span class="n">sample</span></div>

Expand Down
Loading

0 comments on commit 678d665

Please sign in to comment.