Skip to content

Commit

Permalink
deploy: 213f7f8
Browse files Browse the repository at this point in the history
  • Loading branch information
drcege committed Aug 22, 2024
1 parent fec586d commit 8403546
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 66 deletions.
6 changes: 2 additions & 4 deletions _modules/data_juicer/core/analyzer.html
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,6 @@ <h1>Source code for data_juicer.core.analyzer</h1><div class="highlight"><pre>
<span class="bp">self</span><span class="o">.</span><span class="n">cfg</span> <span class="o">=</span> <span class="n">init_configs</span><span class="p">()</span> <span class="k">if</span> <span class="n">cfg</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">cfg</span>

<span class="bp">self</span><span class="o">.</span><span class="n">work_dir</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cfg</span><span class="o">.</span><span class="n">work_dir</span>
<span class="bp">self</span><span class="o">.</span><span class="n">ops</span> <span class="o">=</span> <span class="kc">None</span>

<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">cfg</span><span class="o">.</span><span class="n">use_cache</span><span class="p">:</span>
<span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s1">&#39;Using cache compression method: &#39;</span>
Expand Down Expand Up @@ -163,13 +162,12 @@ <h1>Source code for data_juicer.core.analyzer</h1><div class="highlight"><pre>

<span class="c1"># extract processes</span>
<span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s1">&#39;Preparing process operators...&#39;</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">cfg</span><span class="o">.</span><span class="n">process</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">ops</span> <span class="o">=</span> <span class="n">load_ops</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">cfg</span><span class="o">.</span><span class="n">process</span><span class="p">,</span>
<span class="bp">self</span><span class="o">.</span><span class="n">cfg</span><span class="o">.</span><span class="n">op_fusion</span><span class="p">)</span>
<span class="n">ops</span> <span class="o">=</span> <span class="n">load_ops</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">cfg</span><span class="o">.</span><span class="n">process</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">cfg</span><span class="o">.</span><span class="n">op_fusion</span><span class="p">)</span>

<span class="c1"># 2. stats precompute only for filter ops</span>
<span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s1">&#39;Computing the stats of dataset...&#39;</span><span class="p">)</span>
<span class="n">stats_collected</span> <span class="o">=</span> <span class="kc">False</span>
<span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">ops</span><span class="p">:</span>
<span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">ops</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="n">Filter</span><span class="p">):</span>
<span class="n">original_process</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">process</span>
<span class="n">op</span><span class="o">.</span><span class="n">process</span> <span class="o">=</span> <span class="kc">None</span>
Expand Down
45 changes: 27 additions & 18 deletions _modules/data_juicer/core/data.html
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ <h1>Source code for data_juicer.core.data</h1><div class="highlight"><pre>

<span class="kn">import</span> <span class="nn">copy</span>
<span class="kn">import</span> <span class="nn">inspect</span>
<span class="kn">import</span> <span class="nn">traceback</span>
<span class="kn">from</span> <span class="nn">abc</span> <span class="kn">import</span> <span class="n">ABC</span><span class="p">,</span> <span class="n">abstractmethod</span>
<span class="kn">from</span> <span class="nn">functools</span> <span class="kn">import</span> <span class="n">wraps</span>
<span class="kn">from</span> <span class="nn">time</span> <span class="kn">import</span> <span class="n">time</span>
Expand Down Expand Up @@ -258,24 +259,32 @@ <h1>Source code for data_juicer.core.data</h1><div class="highlight"><pre>
<span class="n">unforkable_operators</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="n">UNFORKABLE</span><span class="o">.</span><span class="n">modules</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span>

<span class="n">dataset</span> <span class="o">=</span> <span class="bp">self</span>
<span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">operators</span><span class="p">:</span>
<span class="n">mp_context</span> <span class="o">=</span> <span class="p">[</span><span class="s1">&#39;forkserver&#39;</span><span class="p">,</span> <span class="s1">&#39;spawn&#39;</span><span class="p">]</span> <span class="k">if</span> <span class="p">(</span>
<span class="n">op</span><span class="o">.</span><span class="n">use_cuda</span><span class="p">()</span> <span class="ow">or</span> <span class="n">op</span><span class="o">.</span><span class="n">_name</span> <span class="ow">in</span> <span class="n">unforkable_operators</span><span class="p">)</span> <span class="k">else</span> <span class="kc">None</span>
<span class="n">setup_mp</span><span class="p">(</span><span class="n">mp_context</span><span class="p">)</span>

<span class="n">start</span> <span class="o">=</span> <span class="n">time</span><span class="p">()</span>
<span class="c1"># run single op</span>
<span class="n">dataset</span> <span class="o">=</span> <span class="n">op</span><span class="p">(</span><span class="n">dataset</span><span class="p">,</span>
<span class="n">exporter</span><span class="o">=</span><span class="n">exporter</span><span class="p">,</span>
<span class="n">checkpointer</span><span class="o">=</span><span class="n">checkpointer</span><span class="p">,</span>
<span class="n">tracer</span><span class="o">=</span><span class="n">tracer</span><span class="p">)</span>
<span class="c1"># record processed ops</span>
<span class="k">if</span> <span class="n">checkpointer</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">checkpointer</span><span class="o">.</span><span class="n">record</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">_name</span><span class="p">,</span>
<span class="nb">list</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">_process_kwargs</span><span class="o">.</span><span class="n">values</span><span class="p">())[</span><span class="mi">0</span><span class="p">])</span>
<span class="n">end</span> <span class="o">=</span> <span class="n">time</span><span class="p">()</span>
<span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s1">&#39;OP [</span><span class="si">{</span><span class="n">op</span><span class="o">.</span><span class="n">_name</span><span class="si">}</span><span class="s1">] Done in </span><span class="si">{</span><span class="n">end</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">start</span><span class="si">:</span><span class="s1">.3f</span><span class="si">}</span><span class="s1">s. &#39;</span>
<span class="sa">f</span><span class="s1">&#39;Left </span><span class="si">{</span><span class="nb">len</span><span class="p">(</span><span class="n">dataset</span><span class="p">)</span><span class="si">}</span><span class="s1"> samples.&#39;</span><span class="p">)</span>
<span class="k">try</span><span class="p">:</span>
<span class="k">for</span> <span class="n">op</span> <span class="ow">in</span> <span class="n">operators</span><span class="p">:</span>
<span class="n">mp_context</span> <span class="o">=</span> <span class="p">[</span><span class="s1">&#39;forkserver&#39;</span><span class="p">,</span> <span class="s1">&#39;spawn&#39;</span><span class="p">]</span> <span class="k">if</span> <span class="p">(</span>
<span class="n">op</span><span class="o">.</span><span class="n">use_cuda</span><span class="p">()</span>
<span class="ow">or</span> <span class="n">op</span><span class="o">.</span><span class="n">_name</span> <span class="ow">in</span> <span class="n">unforkable_operators</span><span class="p">)</span> <span class="k">else</span> <span class="kc">None</span>
<span class="n">setup_mp</span><span class="p">(</span><span class="n">mp_context</span><span class="p">)</span>

<span class="n">start</span> <span class="o">=</span> <span class="n">time</span><span class="p">()</span>
<span class="c1"># run single op</span>
<span class="n">dataset</span> <span class="o">=</span> <span class="n">op</span><span class="o">.</span><span class="n">run</span><span class="p">(</span><span class="n">dataset</span><span class="p">,</span> <span class="n">exporter</span><span class="o">=</span><span class="n">exporter</span><span class="p">,</span> <span class="n">tracer</span><span class="o">=</span><span class="n">tracer</span><span class="p">)</span>
<span class="c1"># record processed ops</span>
<span class="k">if</span> <span class="n">checkpointer</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">checkpointer</span><span class="o">.</span><span class="n">record</span><span class="p">(</span><span class="n">op</span><span class="o">.</span><span class="n">_of_cfg</span><span class="p">)</span>
<span class="n">end</span> <span class="o">=</span> <span class="n">time</span><span class="p">()</span>
<span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s1">&#39;OP [</span><span class="si">{</span><span class="n">op</span><span class="o">.</span><span class="n">_name</span><span class="si">}</span><span class="s1">] Done in </span><span class="si">{</span><span class="n">end</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">start</span><span class="si">:</span><span class="s1">.3f</span><span class="si">}</span><span class="s1">s. &#39;</span>
<span class="sa">f</span><span class="s1">&#39;Left </span><span class="si">{</span><span class="nb">len</span><span class="p">(</span><span class="n">dataset</span><span class="p">)</span><span class="si">}</span><span class="s1"> samples.&#39;</span><span class="p">)</span>
<span class="k">except</span><span class="p">:</span> <span class="c1"># noqa: E722</span>
<span class="n">logger</span><span class="o">.</span><span class="n">error</span><span class="p">(</span><span class="sa">f</span><span class="s1">&#39;An error occurred during Op [</span><span class="si">{</span><span class="n">op</span><span class="o">.</span><span class="n">_name</span><span class="si">}</span><span class="s1">].&#39;</span><span class="p">)</span>
<span class="n">traceback</span><span class="o">.</span><span class="n">print_exc</span><span class="p">()</span>
<span class="n">exit</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
<span class="k">finally</span><span class="p">:</span>
<span class="k">if</span> <span class="n">checkpointer</span><span class="p">:</span>
<span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s1">&#39;Writing checkpoint of dataset processed by &#39;</span>
<span class="s1">&#39;last op...&#39;</span><span class="p">)</span>
<span class="n">dataset</span><span class="o">.</span><span class="n">cleanup_cache_files</span><span class="p">()</span>
<span class="n">checkpointer</span><span class="o">.</span><span class="n">save_ckpt</span><span class="p">(</span><span class="n">dataset</span><span class="p">)</span>
<span class="k">return</span> <span class="n">dataset</span></div>

<div class="viewcode-block" id="NestedDataset.map"><a class="viewcode-back" href="../../../data_juicer.core.html#data_juicer.core.NestedDataset.map">[docs]</a> <span class="k">def</span> <span class="nf">map</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kargs</span><span class="p">):</span>
Expand Down
Loading

0 comments on commit 8403546

Please sign in to comment.