Skip to content

Commit

Permalink
Update documentation. Add BST inorder functionality
Browse files Browse the repository at this point in the history
  • Loading branch information
MaksimEkin committed May 4, 2024
1 parent b9237b8 commit e8d26e4
Show file tree
Hide file tree
Showing 14 changed files with 72 additions and 46 deletions.
15 changes: 11 additions & 4 deletions TELF/factorization/NMFk.py
Original file line number Diff line number Diff line change
Expand Up @@ -637,8 +637,9 @@ def __init__(
k_search_method : str, optional
Which approach to use when searching for the rank or k. The default is "linear".\n
* ``k_search_method='linear'`` will linearly visit each K given in ``Ks`` hyper-parameter of the ``fit()`` function.\n
* ``k_search_method='bst_post'`` will perform post-order binary search. When an ideal rank is found, determined by the selected ``predict_k_method``, all lower ranks are pruned from the search space.
* ``k_search_method='bst_pre'`` will perform pre-order binary search. When an ideal rank is found, determined by the selected ``predict_k_method``, all lower ranks are pruned from the search space.
* ``k_search_method='bst_post'`` will perform post-order binary search. When an ideal rank is found, determined by the selected ``predict_k_method``, all lower ranks are pruned from the search space.\n
* ``k_search_method='bst_pre'`` will perform pre-order binary search. When an ideal rank is found, determined by the selected ``predict_k_method``, all lower ranks are pruned from the search space.\n
* ``k_search_method='bst_in'`` will perform in-order binary search. When an ideal rank is found, determined by the selected ``predict_k_method``, all lower ranks are pruned from the search space.
H_sill_thresh : float, optional
Setting for removing higher ranks from the search space.\n
When searching for the optimal rank with binary search using ``k_search_method='bst_post'`` or ``k_search_method='bst_pre'``, this hyper-parameter can be used to cut off higher ranks from the search space.\n
Expand Down Expand Up @@ -694,7 +695,7 @@ def __init__(
self.H_sill_thresh = H_sill_thresh

# warnings
assert self.k_search_method in ["linear", "bst_pre", "bst_post"], "Invalid k_search_method method. Choose from linear, bst_pre, or bst_post."
assert self.k_search_method in ["linear", "bst_pre", "bst_post", "bst_in"], "Invalid k_search_method method. Choose from linear, bst_pre, bst_in, or bst_post."
assert self.predict_k_method in ["pvalue", "WH_sill", "W_sill", "H_sill", "sill"], "Invalid predict_k_method method. Choose from pvalue, WH_sill, W_sill, H_sill, or sill. sill defaults to WH_sill."

if self.predict_k_method == "sill":
Expand Down Expand Up @@ -863,11 +864,17 @@ def fit(self, X, Ks, name="NMFk", note=""):
node = BST.sorted_array_to_bst(Ks)
if self.K_search_settings["k_search_method"] == "bst_pre":
Ks = list(node.preorder())
if self.K_search_settings["k_search_method"] == "bst_post":
elif self.K_search_settings["k_search_method"] == "bst_post":
Ks = list(node.postorder())
elif self.K_search_settings["k_search_method"] == "bst_in":
Ks = list(node.inorder())
else:
raise Exception("Unknown k_search_method!")

if self.verbose:
print(f'Performing K search with {self.K_search_settings["k_search_method"]}. Ks={Ks}')


#
# check X format
#
Expand Down
1 change: 1 addition & 0 deletions docs/NMFk.html
Original file line number Diff line number Diff line change
Expand Up @@ -528,6 +528,7 @@ <h1>Available Functions<a class="headerlink" href="#available-functions" title="
<li><p><code class="docutils literal notranslate"><span class="pre">k_search_method='linear'</span></code> will linearly visit each K given in <code class="docutils literal notranslate"><span class="pre">Ks</span></code> hyper-parameter of the <code class="docutils literal notranslate"><span class="pre">fit()</span></code> function.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">k_search_method='bst_post'</span></code> will perform post-order binary search. When an ideal rank is found, determined by the selected <code class="docutils literal notranslate"><span class="pre">predict_k_method</span></code>, all lower ranks are pruned from the search space.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">k_search_method='bst_pre'</span></code> will perform pre-order binary search. When an ideal rank is found, determined by the selected <code class="docutils literal notranslate"><span class="pre">predict_k_method</span></code>, all lower ranks are pruned from the search space.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">k_search_method='bst_in'</span></code> will perform in-order binary search. When an ideal rank is found, determined by the selected <code class="docutils literal notranslate"><span class="pre">predict_k_method</span></code>, all lower ranks are pruned from the search space.</p></li>
</ul>
</dd>
<dt>H_sill_thresh<span class="classifier">float, optional</span></dt><dd><p>Setting for removing higher ranks from the search space.</p>
Expand Down
1 change: 1 addition & 0 deletions docs/TELF.factorization.html
Original file line number Diff line number Diff line change
Expand Up @@ -613,6 +613,7 @@ <h2>Submodules<a class="headerlink" href="#submodules" title="Link to this headi
<li><p><code class="docutils literal notranslate"><span class="pre">k_search_method='linear'</span></code> will linearly visit each K given in <code class="docutils literal notranslate"><span class="pre">Ks</span></code> hyper-parameter of the <code class="docutils literal notranslate"><span class="pre">fit()</span></code> function.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">k_search_method='bst_post'</span></code> will perform post-order binary search. When an ideal rank is found, determined by the selected <code class="docutils literal notranslate"><span class="pre">predict_k_method</span></code>, all lower ranks are pruned from the search space.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">k_search_method='bst_pre'</span></code> will perform pre-order binary search. When an ideal rank is found, determined by the selected <code class="docutils literal notranslate"><span class="pre">predict_k_method</span></code>, all lower ranks are pruned from the search space.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">k_search_method='bst_in'</span></code> will perform in-order binary search. When an ideal rank is found, determined by the selected <code class="docutils literal notranslate"><span class="pre">predict_k_method</span></code>, all lower ranks are pruned from the search space.</p></li>
</ul>
</dd>
<dt>H_sill_thresh<span class="classifier">float, optional</span></dt><dd><p>Setting for removing higher ranks from the search space.</p>
Expand Down
2 changes: 1 addition & 1 deletion docs/TELF.pre_processing.Vulture.html
Original file line number Diff line number Diff line change
Expand Up @@ -410,7 +410,7 @@ <h2>Submodules<a class="headerlink" href="#submodules" title="Link to this headi

<dl class="py attribute">
<dt class="sig sig-object py" id="TELF.pre_processing.Vulture.vulture.Vulture.DEFAULT_PIPELINE">
<span class="sig-name descname"><span class="pre">DEFAULT_PIPELINE</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">[SimpleCleaner(module_type='CLEANER',</span> <span class="pre">effective_stop_words=['characteristics',</span> <span class="pre">'characteristic',</span> <span class="pre">'acknowledgment',</span> <span class="pre">'unfortunately',</span> <span class="pre">'significantly',</span> <span class="pre">'investigation',</span> <span class="pre">'predominantly',</span> <span class="pre">'substantially',</span> <span class="pre">'automatically',</span> <span class="pre">'corresponding',</span> <span class="pre">'approximately',</span> <span class="pre">'consequently',</span> <span class="pre">'representing',</span> <span class="pre">'respectively',</span> <span class="pre">'particularly',</span> <span class="pre">'demonstrates',</span> <span class="pre">'nevertheless',</span> <span class="pre">'sufficiently',</span> <span class="pre">'introduction',</span> <span class="pre">'applications',</span> <span class="pre">'successfully',</span> <span class="pre">'specifically',</span> <span class="pre">'demonstrated',</span> <span class="pre">'background:',</span> <span class="pre">'application',</span> <span class="pre">...</span> <span class="pre">(+1359</span> <span class="pre">more)],</span> <span class="pre">patterns={'standardize_hyphens':</span> <span class="pre">(re.compile('[\\u002D\\u2010\\u2011\\u2012\\u2013\\u2014\\u2015\\u2212\\u2E3A\\u2E3B]'),</span> <span class="pre">'-'),</span> <span class="pre">'remove_copyright_statement':</span> <span class="pre">None,</span> <span class="pre">'remove_stop_phrases':</span> <span class="pre">None,</span> <span class="pre">'make_lower_case':</span> <span class="pre">None,</span> <span class="pre">'normalize':</span> <span class="pre">None,</span> <span class="pre">'remove_trailing_dash':</span> <span 
class="pre">('(?&lt;!\\w)-|-(?!\\w)',</span> <span class="pre">''),</span> <span class="pre">'make_hyphens_words':</span> <span class="pre">('([a-z])\\-([a-z])',</span> <span class="pre">''),</span> <span class="pre">'remove_next_line':</span> <span class="pre">('\\n+',</span> <span class="pre">'</span> <span class="pre">'),</span> <span class="pre">'remove_email':</span> <span class="pre">('\\S*&#64;\\S*\\s?',</span> <span class="pre">''),</span> <span class="pre">'remove_formulas':</span> <span class="pre">('\\b\\w*[\\=\\≈\\/\\\\\\±]\\w*\\b',</span> <span class="pre">''),</span> <span class="pre">'remove_dash':</span> <span class="pre">('-',</span> <span class="pre">''),</span> <span class="pre">'remove_between_[]':</span> <span class="pre">('\\[.*?\\]',</span> <span class="pre">'</span> <span class="pre">'),</span> <span class="pre">'remove_between_()':</span> <span class="pre">('\\(.*?\\)',</span> <span class="pre">'</span> <span class="pre">'),</span> <span class="pre">'remove_[]':</span> <span class="pre">('[\\[\\]]',</span> <span class="pre">'</span> <span class="pre">'),</span> <span class="pre">'remove_()':</span> <span class="pre">('[()]',</span> <span class="pre">'</span> <span class="pre">'),</span> <span class="pre">'remove_\\':</span> <span class="pre">('\\\\',</span> <span class="pre">'</span> <span class="pre">'),</span> <span class="pre">'remove_numbers':</span> <span class="pre">('\\d+',</span> <span class="pre">''),</span> <span class="pre">'remove_standalone_numbers':</span> <span class="pre">('\\b\\d+\\b',</span> <span class="pre">''),</span> <span class="pre">'remove_nonASCII_boundary':</span> <span class="pre">('\\b[^\\x00-\\x7F]+\\b',</span> <span class="pre">''),</span> <span class="pre">'remove_nonASCII':</span> <span class="pre">('[^\\x00-\\x7F]+',</span> <span class="pre">''),</span> <span class="pre">'remove_tags':</span> <span class="pre">('&amp;lt;/?.*?&amp;gt;',</span> <span class="pre">''),</span> <span 
class="pre">'remove_special_characters':</span> <span class="pre">('[!|&quot;|#|$|%|&amp;|\\|\\\'|(|)|*|+|,|.|/|:|;|&lt;|=|&gt;|?|&#64;|[|\\|]|^|_|`|{|\\||}|~]',</span> <span class="pre">''),</span> <span class="pre">'isolate_frozen':</span> <span class="pre">None,</span> <span class="pre">'remove_extra_whitespace':</span> <span class="pre">('\\s+',</span> <span class="pre">'</span> <span class="pre">'),</span> <span class="pre">'remove_stop_words':</span> <span class="pre">None,</span> <span class="pre">'min_characters':</span> <span class="pre">None},</span> <span class="pre">exclude_hyphenated_stopwords=False,</span> <span class="pre">sw_pattern=re.compile('\\b[\\w-]+\\b'))]</span></em><a class="headerlink" href="#TELF.pre_processing.Vulture.vulture.Vulture.DEFAULT_PIPELINE" title="Link to this definition">#</a></dt>
<span class="sig-name descname"><span class="pre">DEFAULT_PIPELINE</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">[SimpleCleaner(module_type='CLEANER',</span> <span class="pre">effective_stop_words=['characteristics',</span> <span class="pre">'characteristic',</span> <span class="pre">'acknowledgment',</span> <span class="pre">'unfortunately',</span> <span class="pre">'substantially',</span> <span class="pre">'corresponding',</span> <span class="pre">'predominantly',</span> <span class="pre">'investigation',</span> <span class="pre">'automatically',</span> <span class="pre">'significantly',</span> <span class="pre">'approximately',</span> <span class="pre">'applications',</span> <span class="pre">'specifically',</span> <span class="pre">'representing',</span> <span class="pre">'demonstrated',</span> <span class="pre">'introduction',</span> <span class="pre">'demonstrates',</span> <span class="pre">'consequently',</span> <span class="pre">'particularly',</span> <span class="pre">'respectively',</span> <span class="pre">'successfully',</span> <span class="pre">'sufficiently',</span> <span class="pre">'nevertheless',</span> <span class="pre">'nonetheless',</span> <span class="pre">'significant',</span> <span class="pre">...</span> <span class="pre">(+1359</span> <span class="pre">more)],</span> <span class="pre">patterns={'standardize_hyphens':</span> <span class="pre">(re.compile('[\\u002D\\u2010\\u2011\\u2012\\u2013\\u2014\\u2015\\u2212\\u2E3A\\u2E3B]'),</span> <span class="pre">'-'),</span> <span class="pre">'remove_copyright_statement':</span> <span class="pre">None,</span> <span class="pre">'remove_stop_phrases':</span> <span class="pre">None,</span> <span class="pre">'make_lower_case':</span> <span class="pre">None,</span> <span class="pre">'normalize':</span> <span class="pre">None,</span> <span class="pre">'remove_trailing_dash':</span> <span 
class="pre">('(?&lt;!\\w)-|-(?!\\w)',</span> <span class="pre">''),</span> <span class="pre">'make_hyphens_words':</span> <span class="pre">('([a-z])\\-([a-z])',</span> <span class="pre">''),</span> <span class="pre">'remove_next_line':</span> <span class="pre">('\\n+',</span> <span class="pre">'</span> <span class="pre">'),</span> <span class="pre">'remove_email':</span> <span class="pre">('\\S*&#64;\\S*\\s?',</span> <span class="pre">''),</span> <span class="pre">'remove_formulas':</span> <span class="pre">('\\b\\w*[\\=\\≈\\/\\\\\\±]\\w*\\b',</span> <span class="pre">''),</span> <span class="pre">'remove_dash':</span> <span class="pre">('-',</span> <span class="pre">''),</span> <span class="pre">'remove_between_[]':</span> <span class="pre">('\\[.*?\\]',</span> <span class="pre">'</span> <span class="pre">'),</span> <span class="pre">'remove_between_()':</span> <span class="pre">('\\(.*?\\)',</span> <span class="pre">'</span> <span class="pre">'),</span> <span class="pre">'remove_[]':</span> <span class="pre">('[\\[\\]]',</span> <span class="pre">'</span> <span class="pre">'),</span> <span class="pre">'remove_()':</span> <span class="pre">('[()]',</span> <span class="pre">'</span> <span class="pre">'),</span> <span class="pre">'remove_\\':</span> <span class="pre">('\\\\',</span> <span class="pre">'</span> <span class="pre">'),</span> <span class="pre">'remove_numbers':</span> <span class="pre">('\\d+',</span> <span class="pre">''),</span> <span class="pre">'remove_standalone_numbers':</span> <span class="pre">('\\b\\d+\\b',</span> <span class="pre">''),</span> <span class="pre">'remove_nonASCII_boundary':</span> <span class="pre">('\\b[^\\x00-\\x7F]+\\b',</span> <span class="pre">''),</span> <span class="pre">'remove_nonASCII':</span> <span class="pre">('[^\\x00-\\x7F]+',</span> <span class="pre">''),</span> <span class="pre">'remove_tags':</span> <span class="pre">('&amp;lt;/?.*?&amp;gt;',</span> <span class="pre">''),</span> <span 
class="pre">'remove_special_characters':</span> <span class="pre">('[!|&quot;|#|$|%|&amp;|\\|\\\'|(|)|*|+|,|.|/|:|;|&lt;|=|&gt;|?|&#64;|[|\\|]|^|_|`|{|\\||}|~]',</span> <span class="pre">''),</span> <span class="pre">'isolate_frozen':</span> <span class="pre">None,</span> <span class="pre">'remove_extra_whitespace':</span> <span class="pre">('\\s+',</span> <span class="pre">'</span> <span class="pre">'),</span> <span class="pre">'remove_stop_words':</span> <span class="pre">None,</span> <span class="pre">'min_characters':</span> <span class="pre">None},</span> <span class="pre">exclude_hyphenated_stopwords=False,</span> <span class="pre">sw_pattern=re.compile('\\b[\\w-]+\\b'))]</span></em><a class="headerlink" href="#TELF.pre_processing.Vulture.vulture.Vulture.DEFAULT_PIPELINE" title="Link to this definition">#</a></dt>
<dd></dd></dl>

<dl class="py attribute">
Expand Down
Loading

0 comments on commit e8d26e4

Please sign in to comment.