Skip to content

Commit

Permalink
deploy: 65d7c91
Browse files Browse the repository at this point in the history
  • Loading branch information
drcege committed Nov 5, 2024
1 parent 9ad580a commit e3b3a6d
Show file tree
Hide file tree
Showing 17 changed files with 3,506 additions and 3,189 deletions.
21 changes: 15 additions & 6 deletions _modules/data_juicer/core/data.html
Original file line number Diff line number Diff line change
Expand Up @@ -325,16 +325,23 @@ <h1>Source code for data_juicer.core.data</h1><div class="highlight"><pre>

<span class="k">if</span> <span class="n">inspect</span><span class="o">.</span><span class="n">ismethod</span><span class="p">(</span><span class="n">called_func</span><span class="p">):</span>
<span class="c1"># batched is required for fault-tolerant or batched OP</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">called_func</span><span class="o">.</span><span class="vm">__self__</span><span class="o">.</span><span class="n">turbo</span> <span class="ow">or</span> <span class="nb">hasattr</span><span class="p">(</span>
<span class="k">if</span> <span class="nb">callable</span><span class="p">(</span><span class="nb">getattr</span><span class="p">(</span>
<span class="n">called_func</span><span class="o">.</span><span class="vm">__self__</span><span class="p">,</span>
<span class="s1">&#39;is_batched_op&#39;</span><span class="p">)</span> <span class="ow">and</span> <span class="n">called_func</span><span class="o">.</span><span class="vm">__self__</span><span class="o">.</span><span class="n">is_batched_op</span><span class="p">():</span>
<span class="s1">&#39;is_batched_op&#39;</span><span class="p">))</span> <span class="ow">and</span> <span class="n">called_func</span><span class="o">.</span><span class="vm">__self__</span><span class="o">.</span><span class="n">is_batched_op</span><span class="p">(</span>
<span class="p">)</span> <span class="ow">or</span> <span class="ow">not</span> <span class="n">called_func</span><span class="o">.</span><span class="vm">__self__</span><span class="o">.</span><span class="n">turbo</span><span class="p">:</span>
<span class="n">kargs</span><span class="p">[</span><span class="s1">&#39;batched&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">True</span>
<span class="n">kargs</span><span class="p">[</span><span class="s1">&#39;batch_size&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="n">kargs</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;batch_size&#39;</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> <span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span>
<span class="n">called_func</span><span class="o">.</span><span class="vm">__self__</span><span class="p">,</span> <span class="s1">&#39;is_batched_op&#39;</span>
<span class="p">)</span> <span class="ow">and</span> <span class="n">called_func</span><span class="o">.</span><span class="vm">__self__</span><span class="o">.</span><span class="n">is_batched_op</span><span class="p">()</span> <span class="k">else</span> <span class="mi">1</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">kargs</span><span class="p">[</span><span class="s1">&#39;batched&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">False</span>

<span class="c1"># rank is required for cuda model loading</span>
<span class="k">if</span> <span class="nb">callable</span><span class="p">(</span>
<span class="nb">getattr</span><span class="p">(</span><span class="n">called_func</span><span class="o">.</span><span class="vm">__self__</span><span class="p">,</span>
<span class="s1">&#39;use_cuda&#39;</span><span class="p">))</span> <span class="ow">and</span> <span class="n">called_func</span><span class="o">.</span><span class="vm">__self__</span><span class="o">.</span><span class="n">use_cuda</span><span class="p">():</span>
<span class="n">kargs</span><span class="p">[</span><span class="s1">&#39;with_rank&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">True</span>

<span class="k">if</span> <span class="s1">&#39;new_fingerprint&#39;</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">kargs</span> <span class="ow">or</span> <span class="n">kargs</span><span class="p">[</span><span class="s1">&#39;new_fingerprint&#39;</span><span class="p">]</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">new_fingerprint</span> <span class="o">=</span> <span class="n">generate_fingerprint</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kargs</span><span class="p">)</span>
<span class="n">kargs</span><span class="p">[</span><span class="s1">&#39;new_fingerprint&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="n">new_fingerprint</span>
Expand Down Expand Up @@ -379,10 +386,12 @@ <h1>Source code for data_juicer.core.data</h1><div class="highlight"><pre>
<span class="n">called_func</span> <span class="o">=</span> <span class="n">called_func</span><span class="o">.</span><span class="n">__wrapped__</span>

<span class="c1"># Batched is always required for fault tolerance</span>
<span class="k">if</span> <span class="n">inspect</span><span class="o">.</span><span class="n">ismethod</span><span class="p">(</span>
<span class="n">called_func</span><span class="p">)</span> <span class="ow">and</span> <span class="n">called_func</span><span class="o">.</span><span class="vm">__self__</span><span class="o">.</span><span class="n">is_batched_op</span><span class="p">():</span>
<span class="n">kargs</span><span class="p">[</span><span class="s1">&#39;batched&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">True</span>
<span class="n">kargs</span><span class="p">[</span><span class="s1">&#39;batch_size&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="n">kargs</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;batch_size&#39;</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
<span class="k">if</span> <span class="n">inspect</span><span class="o">.</span><span class="n">ismethod</span><span class="p">(</span><span class="n">called_func</span><span class="p">):</span>
<span class="k">if</span> <span class="nb">callable</span><span class="p">(</span><span class="nb">getattr</span><span class="p">(</span>
<span class="n">called_func</span><span class="o">.</span><span class="vm">__self__</span><span class="p">,</span>
<span class="s1">&#39;is_batched_op&#39;</span><span class="p">))</span> <span class="ow">and</span> <span class="n">called_func</span><span class="o">.</span><span class="vm">__self__</span><span class="o">.</span><span class="n">is_batched_op</span><span class="p">():</span>
<span class="n">kargs</span><span class="p">[</span><span class="s1">&#39;batched&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">True</span>
<span class="n">kargs</span><span class="p">[</span><span class="s1">&#39;batch_size&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="n">kargs</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;batch_size&#39;</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>

<span class="k">if</span> <span class="s1">&#39;new_fingerprint&#39;</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">kargs</span> <span class="ow">or</span> <span class="n">kargs</span><span class="p">[</span><span class="s1">&#39;new_fingerprint&#39;</span><span class="p">]</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">new_fingerprint</span> <span class="o">=</span> <span class="n">generate_fingerprint</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kargs</span><span class="p">)</span>
Expand Down
9 changes: 9 additions & 0 deletions _modules/data_juicer/ops/base_op.html
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ <h1>Source code for data_juicer.ops.base_op</h1><div class="highlight"><pre>
<span class="kn">import</span> <span class="nn">traceback</span>
<span class="kn">from</span> <span class="nn">functools</span> <span class="kn">import</span> <span class="n">wraps</span>

<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="kn">import</span> <span class="nn">pyarrow</span> <span class="k">as</span> <span class="nn">pa</span>
<span class="kn">from</span> <span class="nn">loguru</span> <span class="kn">import</span> <span class="n">logger</span>

Expand Down Expand Up @@ -212,6 +213,11 @@ <h1>Source code for data_juicer.ops.base_op</h1><div class="highlight"><pre>
<span class="bp">self</span><span class="o">.</span><span class="n">image_key</span> <span class="o">=</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;image_key&#39;</span><span class="p">,</span> <span class="s1">&#39;images&#39;</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">audio_key</span> <span class="o">=</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;audio_key&#39;</span><span class="p">,</span> <span class="s1">&#39;audios&#39;</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">video_key</span> <span class="o">=</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;video_key&#39;</span><span class="p">,</span> <span class="s1">&#39;videos&#39;</span><span class="p">)</span>

<span class="bp">self</span><span class="o">.</span><span class="n">query_key</span> <span class="o">=</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;query_key&#39;</span><span class="p">,</span> <span class="s1">&#39;query&#39;</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">response_key</span> <span class="o">=</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;response_key&#39;</span><span class="p">,</span> <span class="s1">&#39;response&#39;</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">history_key</span> <span class="o">=</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;history_key&#39;</span><span class="p">,</span> <span class="s1">&#39;history&#39;</span><span class="p">)</span>

<span class="bp">self</span><span class="o">.</span><span class="n">batch_size</span> <span class="o">=</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;batch_size&#39;</span><span class="p">,</span> <span class="mi">1000</span><span class="p">)</span>

<span class="c1"># whether the model can be accelerated using cuda</span>
Expand Down Expand Up @@ -289,6 +295,9 @@ <h1>Source code for data_juicer.ops.base_op</h1><div class="highlight"><pre>
<span class="n">dataset</span> <span class="o">=</span> <span class="n">NestedDataset</span><span class="p">(</span><span class="n">dataset</span><span class="p">)</span>
<span class="k">return</span> <span class="n">dataset</span>

<span class="k">def</span> <span class="nf">empty_history</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="n">np</span><span class="o">.</span><span class="n">empty</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">),</span> <span class="n">dtype</span><span class="o">=</span><span class="nb">str</span><span class="p">)</span>


<div class="viewcode-block" id="Mapper"><a class="viewcode-back" href="../../../data_juicer.ops.html#data_juicer.ops.Mapper">[docs]</a><span class="k">class</span> <span class="nc">Mapper</span><span class="p">(</span><span class="n">OP</span><span class="p">):</span>

Expand Down
Loading

0 comments on commit e3b3a6d

Please sign in to comment.